Lesson 9: Spark Streaming Source Code Walkthrough — A Thorough Study of the Receiver's Elegant Driver-Side Implementation and Its Full Lifecycle

First, receivers are started as the application starts. From Spark Core's point of view nothing special is happening: it does not know that a receiver has been started, it just sees an ordinary job.

Spark Streaming launches a dedicated job (an RDD with a transformation and an action) to run each receiver. That job's RDD has only one partition, and the partition is special: it contains exactly one data element, and that element is the receiver itself.
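To make the idea concrete, here is a minimal, self-contained sketch (not the actual Spark Streaming code, which is walked through in startReceiver below): a one-partition RDD whose only element stands in for the receiver, submitted as a job whose single task "runs" it.

import scala.concurrent.Await
import scala.concurrent.duration.Duration

import org.apache.spark.{SparkConf, SparkContext, TaskContext}

object ReceiverAsJobSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(
      new SparkConf().setAppName("receiver-as-job").setMaster("local[2]"))

    // Stand-in for a real Receiver: any serializable object will do for this sketch.
    val receiver = "pretend-this-is-a-receiver"

    // A one-partition RDD whose only element is the receiver.
    val receiverRDD = sc.makeRDD(Seq(receiver), 1)

    // Submit a job whose single task "runs" the receiver (here it just prints it;
    // the real task blocks inside the receiver loop until it is stopped).
    val future = sc.submitJob[String, Unit, Unit](
      receiverRDD,
      (it: Iterator[String]) =>
        it.foreach(r => println(s"running $r in partition ${TaskContext.get().partitionId()}")),
      Seq(0),          // the RDD's only partition
      (_, _) => (),    // ignore per-partition results
      ())              // overall job result

    Await.ready(future, Duration.Inf)
    sc.stop()
  }
}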

This naive approach can go wrong in two ways:

1. Several receivers may land on the same executor, causing load imbalance.

2. Starting a receiver may simply fail.

   While the job is running, the executor hosting a receiver task can crash, which kills the receiver launched as that task; once the task's retries are exhausted, the whole job fails. These are exactly the problems the ReceiverTracker design below addresses: receivers are scheduled across executors explicitly, and a failed receiver job is resubmitted by the tracker itself rather than relying on task retries.

 

Different partitions correspond to different receivers, and a single InputDStream starts only one receiver, so more parallel ingestion requires more input DStreams, as sketched below.
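Because one InputDStream means one receiver, the standard way to get more receiving parallelism is to create several input DStreams and union them. A minimal sketch (hosts and ports are placeholders):

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

// local[4]: 3 cores for the 3 receivers plus at least 1 core for processing
val conf = new SparkConf().setAppName("multi-receiver").setMaster("local[4]")
val ssc = new StreamingContext(conf, Seconds(2))

// Each call creates its own InputDStream and therefore its own receiver.
val streams = (1 to 3).map(i => ssc.socketTextStream("localhost", 9000 + i))
val unified = ssc.union(streams)   // one DStream backed by three receivers

unified.count().print()
ssc.start()
ssc.awaitTermination()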

 

Source code

StreamingContext.scala

def socketTextStream(
    hostname: String,
    port: Int,
    storageLevel: StorageLevel = StorageLevel.MEMORY_AND_DISK_SER_2
  ): ReceiverInputDStream[String] = withNamedScope("socket text stream") {
  socketStream[String](hostname, port, SocketReceiver.bytesToLines, storageLevel)
}
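socketTextStream is simply socketStream with SocketReceiver.bytesToLines as the converter, so a different converter can be plugged in directly. A sketch, assuming an existing StreamingContext ssc (the CSV format here is a made-up example, not anything from Spark):

import java.io.{BufferedReader, InputStream, InputStreamReader}
import java.nio.charset.StandardCharsets

import org.apache.spark.storage.StorageLevel

// Convert the raw socket InputStream into an iterator of parsed records.
def csvRecords(in: InputStream): Iterator[Array[String]] = {
  val reader = new BufferedReader(new InputStreamReader(in, StandardCharsets.UTF_8))
  Iterator.continually(reader.readLine()).takeWhile(_ != null).map(_.split(","))
}

val records = ssc.socketStream[Array[String]](
  "localhost", 9999, csvRecords, StorageLevel.MEMORY_AND_DISK_SER_2)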

DStreamGraph.scala

private val inputStreams = new ArrayBuffer[InputDStream[_]]()
private val outputStreams = new ArrayBuffer[DStream[_]]()
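These two buffers are filled when streams are constructed: each InputDStream registers itself with the graph in its constructor, and each output operation (print, foreachRDD, ...) registers an output stream. Paraphrased from the Spark 1.x source (details may vary slightly by version):

// InputDStream.scala: every input stream registers itself at construction time
ssc.graph.addInputStream(this)

// DStreamGraph.scala
def addInputStream(inputStream: InputDStream[_]) {
  this.synchronized {
    inputStream.setGraph(this)
    inputStreams += inputStream
  }
}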

 

JobScheduler

 

def start(): Unit = synchronized {
  if (eventLoop != null) return // scheduler has already been started

  logDebug("Starting JobScheduler")
  eventLoop = new EventLoop[JobSchedulerEvent]("JobScheduler") {
    override protected def onReceive(event: JobSchedulerEvent): Unit = processEvent(event)

    override protected def onError(e: Throwable): Unit = reportError("Error in job scheduler", e)
  }
  eventLoop.start()

  // attach rate controllers of input streams to receive batch completion updates
  for {
    inputDStream <- ssc.graph.getInputStreams
    rateController <- inputDStream.rateController
  } ssc.addStreamingListener(rateController)

  listenerBus.start(ssc.sparkContext)
  receiverTracker = new ReceiverTracker(ssc)
  inputInfoTracker = new InputInfoTracker(ssc)
  receiverTracker.start()
  jobGenerator.start()
  logInfo("Started JobScheduler")
}
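The rate controllers attached in the for comprehension above only exist when backpressure is enabled; each one listens for batch completions and feeds a new rate limit back to its receiver through the tracker. A minimal configuration sketch (standard Spark settings, values are placeholders):

import org.apache.spark.SparkConf

val conf = new SparkConf()
  .setAppName("receiver-demo")
  // Let the rate controller adapt each receiver's ingestion rate from batch feedback.
  .set("spark.streaming.backpressure.enabled", "true")
  // Hard upper bound on records per second per receiver.
  .set("spark.streaming.receiver.maxRate", "10000")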

ReceiverTracker

/** Start the endpoint and receiver execution thread. */
def start(): Unit = synchronized {
  if (isTrackerStarted) {
    throw new SparkException("ReceiverTracker already started")
  }

  if (!receiverInputStreams.isEmpty) {
    endpoint = ssc.env.rpcEnv.setupEndpoint(
      "ReceiverTracker", new ReceiverTrackerEndpoint(ssc.env.rpcEnv))
    if (!skipReceiverLaunch) launchReceivers()
    logInfo("ReceiverTracker started")
    trackerState = Started
  }
}

/**
 * Get the receivers from the ReceiverInputDStreams, distributes them to the
 * worker nodes as a parallel collection, and runs them.
 */
private def launchReceivers(): Unit = {
  // One input source produces exactly one receiver
  val receivers = receiverInputStreams.map(nis => {
    val rcvr = nis.getReceiver()
    rcvr.setReceiverId(nis.id)
    rcvr
  })
  runDummySparkJob()
  logInfo("Starting " + receivers.length + " receivers")
  endpoint.send(StartAllReceivers(receivers))
}
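launchReceivers calls runDummySparkJob before sending StartAllReceivers; its purpose is to make sure the executors have actually registered, so the receivers are not all scheduled onto the one executor the driver happens to know about at startup. Paraphrased from the Spark 1.6 source (may differ in other versions):

/**
 * Run a dummy Spark job to ensure that all slaves have registered. This avoids all the
 * receivers being scheduled onto the same node.
 */
private def runDummySparkJob(): Unit = {
  if (!ssc.sparkContext.isLocal) {
    ssc.sparkContext.makeRDD(1 to 50, 50).map(x => (x, 1)).reduceByKey(_ + _, 20).collect()
  }
  assert(getExecutors.nonEmpty)
}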

private var endpoint: RpcEndpointRef = null

receive

override def receive: PartialFunction[Any, Unit] = {
  // Local messages
  case StartAllReceivers(receivers) =>
    // Decide which executors each receiver should run on
    val scheduledLocations = schedulingPolicy.scheduleReceivers(receivers, getExecutors)
    for (receiver <- receivers) {
      val executors = scheduledLocations(receiver.streamId)
      updateReceiverScheduledExecutors(receiver.streamId, executors)
      receiverPreferredLocations(receiver.streamId) = receiver.preferredLocation
      startReceiver(receiver, executors)
    }
  case RestartReceiver(receiver) =>
    // Old scheduled executors minus the ones that are not active any more
    val oldScheduledExecutors = getStoredScheduledExecutors(receiver.streamId)
    val scheduledLocations = if (oldScheduledExecutors.nonEmpty) {
        // Try global scheduling again
        oldScheduledExecutors
      } else {
        val oldReceiverInfo = receiverTrackingInfos(receiver.streamId)
        // Clear "scheduledLocations" to indicate we are going to do local scheduling
        val newReceiverInfo = oldReceiverInfo.copy(
          state = ReceiverState.INACTIVE, scheduledLocations = None)
        receiverTrackingInfos(receiver.streamId) = newReceiverInfo
        schedulingPolicy.rescheduleReceiver(
          receiver.streamId,
          receiver.preferredLocation,
          receiverTrackingInfos,
          getExecutors)
      }
    // Assume there is one receiver restarting at one time, so we don't need to update
    // receiverTrackingInfos
    startReceiver(receiver, scheduledLocations)
  case c: CleanupOldBlocks =>
    receiverTrackingInfos.values.flatMap(_.endpoint).foreach(_.send(c))
  case UpdateReceiverRateLimit(streamUID, newRate) =>
    for (info <- receiverTrackingInfos.get(streamUID); eP <- info.endpoint) {
      eP.send(UpdateRateLimit(newRate))
    }
  // Remote messages
  case ReportError(streamId, message, error) =>
    reportError(streamId, message, error)
}

startReceiver

/**
 * Start a receiver along with its scheduled executors
 */
private def startReceiver(
    receiver: Receiver[_],
    scheduledLocations: Seq[TaskLocation]): Unit = {
  def shouldStartReceiver: Boolean = {
    // It's okay to start when trackerState is Initialized or Started
    !(isTrackerStopping || isTrackerStopped)
  }

  val receiverId = receiver.streamId
  if (!shouldStartReceiver) {
    onReceiverJobFinish(receiverId)
    return
  }

  val checkpointDirOption = Option(ssc.checkpointDir)
  val serializableHadoopConf =
    new SerializableConfiguration(ssc.sparkContext.hadoopConfiguration)

  // Function to start the receiver on the worker node
  val startReceiverFunc: Iterator[Receiver[_]] => Unit =
    (iterator: Iterator[Receiver[_]]) => {
      if (!iterator.hasNext) {
        throw new SparkException(
          "Could not start receiver as object not found.")
      }
      if (TaskContext.get().attemptNumber() == 0) {
        val receiver = iterator.next()
        assert(iterator.hasNext == false)
        val supervisor = new ReceiverSupervisorImpl(
          receiver, SparkEnv.get, serializableHadoopConf.value, checkpointDirOption)
        supervisor.start()
        supervisor.awaitTermination()
      } else {
        // It's restarted by TaskScheduler, but we want to reschedule it again. So exit it.
      }
    }

  // Create the RDD using the scheduledLocations to run the receiver in a Spark job
  val receiverRDD: RDD[Receiver[_]] =
    if (scheduledLocations.isEmpty) {
      ssc.sc.makeRDD(Seq(receiver), 1)
    } else {
      val preferredLocations = scheduledLocations.map(_.toString).distinct
      ssc.sc.makeRDD(Seq(receiver -> preferredLocations))
    }
  receiverRDD.setName(s"Receiver $receiverId")
  ssc.sparkContext.setJobDescription(s"Streaming job running receiver $receiverId")
  ssc.sparkContext.setCallSite(Option(ssc.getStartSite()).getOrElse(Utils.getCallSite()))

  val future = ssc.sparkContext.submitJob[Receiver[_], Unit, Unit](
    receiverRDD, startReceiverFunc, Seq(0), (_, _) => Unit, ())
  // We will keep restarting the receiver job until ReceiverTracker is stopped
  future.onComplete {
    case Success(_) =>
      if (!shouldStartReceiver) {
        onReceiverJobFinish(receiverId)
      } else {
        logInfo(s"Restarting Receiver $receiverId")
        self.send(RestartReceiver(receiver))
      }
    case Failure(e) =>
      if (!shouldStartReceiver) {
        onReceiverJobFinish(receiverId)
      } else {
        logError("Receiver has been stopped. Try to restart it.", e)
        logInfo(s"Restarting Receiver $receiverId")
        self.send(RestartReceiver(receiver))
      }
  }(submitJobThreadPool)
  logInfo(s"Receiver ${receiver.streamId} started")
}

 

Does one job start a single receiver, or does one job start all the receivers at once?

The line receiverRDD.setName(s"Receiver $receiverId") shows that each job carries exactly one receiver: a separate Spark job is submitted for every receiver.

 

 

startReceiver is invoked once per receiver: the StartAllReceivers handler loops over the receivers and calls startReceiver(receiver, executors), while the RestartReceiver handler calls startReceiver(receiver, scheduledLocations).

 

onReceiverJobFinish

/**
 * Call when a receiver is terminated. It means we won't restart its Spark job.
 */
private def onReceiverJobFinish(receiverId: Int): Unit = {
  receiverJobExitLatch.countDown()
  receiverTrackingInfos.remove(receiverId).foreach { receiverTrackingInfo =>
    if (receiverTrackingInfo.state == ReceiverState.ACTIVE) {
      logWarning(s"Receiver $receiverId exited but didn't deregister")
    }
  }
}

 

ReceiverInputDStream

abstract class ReceiverInputDStream[T: ClassTag](ssc_ : StreamingContext)
  extends InputDStream[T](ssc_) {

 

ReceiverSupervisorImpl

/** Start the supervisor */
def start() {
  onStart()
  startReceiver()
}

 

override protected def onStart() {
  registeredBlockGenerators.foreach { _.start() }
}

/** Start receiver */
def startReceiver(): Unit = synchronized {
  try {
    if (onReceiverStart()) {
      logInfo("Starting receiver")
      receiverState = Started
      receiver.onStart()
      logInfo("Called receiver onStart")
    } else {
      // The driver refused us
      stop("Registered unsuccessfully because Driver refused to start receiver " + streamId, None)
    }
  } catch {
    case NonFatal(t) =>
      stop("Error starting receiver " + streamId, Some(t))
  }
}
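What receiver.onStart() actually runs is user (or library) code written against the public Receiver API. A minimal custom receiver for reference (a sketch following the pattern in Spark's custom receiver guide; the socket protocol and class name are placeholders):

import java.io.{BufferedReader, InputStreamReader}
import java.net.Socket
import java.nio.charset.StandardCharsets

import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.receiver.Receiver

class LineReceiver(host: String, port: Int)
  extends Receiver[String](StorageLevel.MEMORY_AND_DISK_SER_2) {

  // Called by the supervisor on the executor; must not block, so start a thread.
  def onStart(): Unit = {
    new Thread("LineReceiver") {
      override def run(): Unit = receive()
    }.start()
  }

  // The supervisor handles shutdown; nothing extra to clean up here.
  def onStop(): Unit = {}

  private def receive(): Unit = {
    try {
      val socket = new Socket(host, port)
      val reader = new BufferedReader(
        new InputStreamReader(socket.getInputStream, StandardCharsets.UTF_8))
      var line = reader.readLine()
      while (!isStopped() && line != null) {
        store(line)               // hand the record to the supervisor / BlockGenerator
        line = reader.readLine()
      }
      reader.close()
      socket.close()
      restart("Trying to connect again")       // ask the supervisor to restart this receiver
    } catch {
      case e: java.net.ConnectException => restart(s"Could not connect to $host:$port", e)
      case t: Throwable                 => restart("Error receiving data", t)
    }
  }
}

Such a receiver is wired in with ssc.receiverStream(new LineReceiver(host, port)), which yields the ReceiverInputDStream whose getReceiver() the tracker collects in launchReceivers.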

 

onReceiverStart

override protected def onReceiverStart(): Boolean = {
  val msg = RegisterReceiver(
    streamId, receiver.getClass.getSimpleName, host, executorId, endpoint)
  trackerEndpoint.askWithRetry[Boolean](msg)
}

/** Remote RpcEndpointRef for the ReceiverTracker */
private val trackerEndpoint = RpcUtils.makeDriverRef("ReceiverTracker", env.conf, env.rpcEnv)

 

ReceiverSchedulingPolicy.scala

// Spread receivers across executors as evenly as possible

def scheduleReceivers(
    receivers: Seq[Receiver[_]],
    executors: Seq[ExecutorCacheTaskLocation]): Map[Int, Seq[TaskLocation]] = {
  if (receivers.isEmpty) {
    return Map.empty
  }

  if (executors.isEmpty) {
    return receivers.map(_.streamId -> Seq.empty).toMap
  }

  val hostToExecutors = executors.groupBy(_.host)
  val scheduledLocations = Array.fill(receivers.length)(new mutable.ArrayBuffer[TaskLocation])
  val numReceiversOnExecutor = mutable.HashMap[ExecutorCacheTaskLocation, Int]()
  // Set the initial value to 0
  executors.foreach(e => numReceiversOnExecutor(e) = 0)

  // Firstly, we need to respect "preferredLocation". So if a receiver has "preferredLocation",
  // we need to make sure the "preferredLocation" is in the candidate scheduled executor list.
  for (i <- 0 until receivers.length) {
    // Note: preferredLocation is host but executors are host_executorId
    receivers(i).preferredLocation.foreach { host =>
      hostToExecutors.get(host) match {
        case Some(executorsOnHost) =>
          // preferredLocation is a known host. Select an executor that has the least receivers in
          // this host
          val leastScheduledExecutor =
            executorsOnHost.minBy(executor => numReceiversOnExecutor(executor))
          scheduledLocations(i) += leastScheduledExecutor
          numReceiversOnExecutor(leastScheduledExecutor) =
            numReceiversOnExecutor(leastScheduledExecutor) + 1
        case None =>
          // preferredLocation is an unknown host.
          // Note: There are two cases:
          // 1. This executor is not up. But it may be up later.
          // 2. This executor is dead, or it's not a host in the cluster.
          // Currently, simply add host to the scheduled executors.

          // Note: host could be `HDFSCacheTaskLocation`, so use `TaskLocation.apply` to handle
          // this case
          scheduledLocations(i) += TaskLocation(host)
      }
    }
  }
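The balancing idea visible above (on the preferred host, pick the executor that currently has the fewest receivers) can be isolated in a small, self-contained sketch; this is plain Scala, not the Spark classes:

import scala.collection.mutable

// Toy model: executors identified by "host_executorId", like ExecutorCacheTaskLocation.
val executors = Seq("host1_1", "host1_2", "host2_1")
val numReceiversOnExecutor = mutable.HashMap(executors.map(_ -> 0): _*)

// Three receivers all preferring host1: each goes to the least-loaded executor on that host.
val preferredHost = "host1"
val assignments = (0 until 3).map { receiverId =>
  val onHost = executors.filter(_.startsWith(preferredHost + "_"))
  val leastLoaded = onHost.minBy(numReceiversOnExecutor)
  numReceiversOnExecutor(leastLoaded) += 1
  receiverId -> leastLoaded
}
println(assignments)  // Vector((0,host1_1), (1,host1_2), (2,host1_1))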

 

 

 

Blogger: 罗白莲
Source material: 王家林 (Spark customization course)
Sina Weibo: http://www.weibo.com/ilovepains

 
