1. As usual, start from an example and follow the thread into the source.
object NetworkWordCount {
  def main(args: Array[String]) {
    if (args.length < 2) {
      System.err.println("Usage: NetworkWordCount <hostname> <port>")
      System.exit(1)
    }
    StreamingExamples.setStreamingLogLevels()

    // Create the context with a 40 second batch interval
    val sparkConf = new SparkConf().setAppName("NetworkWordCount").setMaster("local[5]")
    val ssc = new StreamingContext(sparkConf, Seconds(40))

    // Create a socket stream on target ip:port and count the
    // words in the input stream of \n delimited text (eg. generated by 'nc').
    // Note that skipping replication in the storage level is fine only when running
    // locally; replication is necessary in a distributed scenario for fault tolerance.
    val lines = ssc.socketTextStream("192.168.4.41", 9999, StorageLevel.MEMORY_AND_DISK_SER)
    val words = lines.flatMap(_.split(" "))
    val wordCounts = words.map(x => (x, 1)).reduceByKey(_ + _)
    wordCounts.print()

    ssc.start()
    ssc.awaitTermination()
  }
}
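To feed this example, something must listen on port 9999 of the target host and write newline-delimited text; normally that is nc -lk 9999. A minimal Scala stand-in, assuming only the JDK (the object name and message are made up; run it on 192.168.4.41, or point the example at localhost):

import java.io.PrintWriter
import java.net.ServerSocket

// Plays the role of `nc -lk 9999`: accepts one connection, writes a line per second.
object TextServer {
  def main(args: Array[String]): Unit = {
    val server = new ServerSocket(9999)
    val socket = server.accept()                 // NetworkWordCount connects here
    val out = new PrintWriter(socket.getOutputStream, true)
    while (true) {
      out.println("hello spark hello streaming") // \n-delimited text, as the example expects
      Thread.sleep(1000)
    }
  }
}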
2. ssc.start() ==> this calls JobScheduler.start().
===> First, a quick word about ReceiverTracker and InputInfoTracker:
a. ReceiverTracker: runs on the driver and manages every Receiver running on the executors.
b. InputInfoTracker: tracks statistics about the input streams' data, which are then exposed to StreamingListener for monitoring.
private[streaming] class JobScheduler(val ssc: StreamingContext) extends Logging {
  ...
  def start(): Unit = synchronized {
    ...
    // Handles ReceiverInputDStream-based sources such as SocketInputDStream,
    // FlumePollingInputDStream, FlumeInputDStream, etc.
    receiverTracker = new ReceiverTracker(ssc)
    inputInfoTracker = new InputInfoTracker(ssc)
    receiverTracker.start()
    jobGenerator.start()
    logInfo("Started JobScheduler")
  }
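As note (b) above says, the per-batch statistics that InputInfoTracker gathers surface through the public StreamingListener API. A minimal listener sketch (the class name and printed line are mine), registered via ssc.addStreamingListener:

import org.apache.spark.streaming.scheduler.{StreamingListener, StreamingListenerBatchCompleted}

// Prints the record count of every completed batch; the numbers originate from
// the input info reported to InputInfoTracker.
class InputStatsListener extends StreamingListener {
  override def onBatchCompleted(batchCompleted: StreamingListenerBatchCompleted): Unit = {
    val info = batchCompleted.batchInfo
    println(s"batch ${info.batchTime}: ${info.numRecords} records ingested")
  }
}

// Usage: ssc.addStreamingListener(new InputStatsListener)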
3. Enter receiverTracker.start(), which registers ReceiverTrackerEndpoint with the RpcEnv and launches the Receivers on the executors.
/** Start the endpoint and receiver execution thread. */
def start(): Unit = synchronized {
  if (isTrackerStarted) {
    throw new SparkException("ReceiverTracker already started")
  }
  // receiverInputStreams checks whether any of DStreamGraph's inputStreams are
  // subclasses of ReceiverInputDStream, such as our SocketInputDStream.
  if (!receiverInputStreams.isEmpty) {
    // Register ReceiverTrackerEndpoint with the RpcEnv. ReceiverTracker runs in the
    // driver and supervises every Receiver in the cluster; each Receiver in turn
    // reports its state, received data, lifecycle events, etc. back to
    // ReceiverTrackerEndpoint.
    endpoint = ssc.env.rpcEnv.setupEndpoint(
      "ReceiverTracker", new ReceiverTrackerEndpoint(ssc.env.rpcEnv))
    // Call launchReceivers()
    if (!skipReceiverLaunch) launchReceivers()
    logInfo("ReceiverTracker started")
    trackerState = Started
  }
}
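To make the receiverInputStreams check concrete, here is a pure-Scala model of the filtering it implies; the classes below are simplified stand-ins I made up, not Spark's real hierarchy:

// Hypothetical stand-ins modeling the InputDStream hierarchy
abstract class InputDStream(val id: Int)
class FileInputDStream(id: Int) extends InputDStream(id)              // needs no Receiver
abstract class ReceiverInputDStream(id: Int) extends InputDStream(id) // needs a Receiver
class SocketInputDStream(id: Int) extends ReceiverInputDStream(id)

object ReceiverFilterDemo extends App {
  val inputStreams: Seq[InputDStream] = Seq(new SocketInputDStream(0), new FileInputDStream(1))
  // Mirrors "does the graph hold any ReceiverInputDStream subclasses?"
  val receiverInputStreams = inputStreams.collect { case r: ReceiverInputDStream => r }
  println(receiverInputStreams.map(_.id)) // List(0): the file-based stream is filtered out
}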
4. Call launchReceivers(), which distributes the Receiver of each ReceiverInputDStream subclass to executors on the worker nodes for execution.
private def launchReceivers(): Unit = {
  // ReceiverInputDStream's parent class is InputDStream.
  // InputDStream subclasses that are not ReceiverInputDStreams, such as FileInputDStream
  // and DirectKafkaInputDStream, skip the Receiver layer entirely and are therefore
  // more efficient.
  val receivers = receiverInputStreams.map(nis => {
    // Here the Receiver is a SocketReceiver
    val rcvr = nis.getReceiver()
    // Each InputDStream has a unique id; that id is assigned to the Receiver as its streamId
    rcvr.setReceiverId(nis.id)
    rcvr
  })
  // runDummySparkJob() makes sure all the nodes are alive and avoids all the
  // receivers being clustered on one node
  runDummySparkJob()

  logInfo("Starting " + receivers.length + " receivers")
  // Wrap all the Receivers in the StartAllReceivers case class and send it to
  // ReceiverTrackerEndpoint. send() expects no reply, so StartAllReceivers is
  // handled by ReceiverTrackerEndpoint.receive.
  endpoint.send(StartAllReceivers(receivers))
}
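runDummySparkJob deserves a note: in the Spark source it simply runs a tiny shuffle job (skipped in local mode) and then asserts that getExecutors is non-empty. A standalone sketch of the same trick, with the app name and master assumed for illustration:

import org.apache.spark.{SparkConf, SparkContext}

object DummyJobDemo {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("DummyJobDemo").setMaster("local[2]"))
    // Same shape as the dummy job: a small shuffle whose only purpose is to make
    // every executor run at least one task and register with the driver before
    // the receivers are scheduled.
    sc.makeRDD(1 to 50, 50).map(x => (x, 1)).reduceByKey(_ + _, 20).collect()
    sc.stop()
  }
}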
5. From the source we can see that ReceiverTrackerEndpoint is a thread-safe endpoint (it extends ThreadSafeRpcEndpoint).
==> Focus on the StartAllReceivers case matched by the receive partial function.
/** RpcEndpoint to receive messages from the receivers. */
private class ReceiverTrackerEndpoint(override val rpcEnv: RpcEnv) extends ThreadSafeRpcEndpoint {

  // TODO Remove this thread pool after https://github.com/apache/spark/issues/7385 is merged
  private val submitJobThreadPool = ExecutionContext.fromExecutorService(
    ThreadUtils.newDaemonCachedThreadPool("submit-job-thread-pool"))

  private val walBatchingThreadPool = ExecutionContext.fromExecutorService(
    ThreadUtils.newDaemonCachedThreadPool("wal-batching-thread-pool"))

  @volatile private var active: Boolean = true

  override def receive: PartialFunction[Any, Unit] = {
    // Local messages
    case StartAllReceivers(receivers) =>
      // getExecutors returns the list of executors, which determines where a Receiver
      // may run. scheduleReceivers returns
      // Map((streamId, ArrayBuffer[ExecutorCacheTaskLocation]), ...),
      // e.g. Map(0 -> ArrayBuffer("executor_localhost_driver")); it is traced below.
      // From this we can conclude that each receiver is assigned to one executor.
      val scheduledLocations = schedulingPolicy.scheduleReceivers(receivers, getExecutors)
      for (receiver <- receivers) {
        // Look up the executors scheduled for this receiver's id
        // (the ArrayBuffer[ExecutorCacheTaskLocation] above)
        val executors = scheduledLocations(receiver.streamId)
        // Populates ReceiverTracker's member:
        // receiverTrackingInfos = new HashMap[Int, ReceiverTrackingInfo]
        updateReceiverScheduledExecutors(receiver.streamId, executors)
        // receiverPreferredLocations stores each Receiver's preferredLocation;
        // SocketReceiver does not override that method, so preferredLocation is None.
        receiverPreferredLocations(receiver.streamId) = receiver.preferredLocation
        // Launch each Receiver on its executor; stepping in shows that each Receiver
        // corresponds to one Spark submitJob call.
        startReceiver(receiver, executors)
      }
    case RestartReceiver(receiver) => ...
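receive above is nothing more than a PartialFunction[Any, Unit] that the RPC layer applies to each incoming message. A runnable toy model of that dispatch (the message classes here are simplified stand-ins, not Spark's):

object ReceiveDemo extends App {
  sealed trait TrackerMessage
  case class StartAllReceivers(streamIds: Seq[Int]) extends TrackerMessage
  case class RestartReceiver(streamId: Int) extends TrackerMessage

  // Same shape as ReceiverTrackerEndpoint.receive: one case per message type
  val receive: PartialFunction[Any, Unit] = {
    case StartAllReceivers(ids) => println(s"schedule and start receivers $ids")
    case RestartReceiver(id)    => println(s"reschedule receiver $id")
  }

  receive(StartAllReceivers(Seq(0)))             // schedule and start receivers List(0)
  println(receive.isDefinedAt("unknown message")) // false: a partial function can decline a message
}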
a. Trace into val scheduledLocations = schedulingPolicy.scheduleReceivers(...):
This method returns Map((streamId, ArrayBuffer[ExecutorCacheTaskLocation]), ...), i.e. the executor on which each Receiver will run. The structure looks like this:
(0,ArrayBuffer("executor_localhost_driver"))
==> This is also a good review of the Scala collections API (see the standalone sketch after the code below).
/**
 * From this we can conclude: one receiver corresponds to one executor.
 */
def scheduleReceivers(
    receivers: Seq[Receiver[_]],
    executors: Seq[ExecutorCacheTaskLocation]): Map[Int, Seq[TaskLocation]] = {
  // ExecutorCacheTaskLocation is a subclass of TaskLocation
  if (receivers.isEmpty) {
    // scala> Map.empty
    // res0: scala.collection.immutable.Map[Nothing,Nothing] = Map()
    return Map.empty
  }

  if (executors.isEmpty) {
    // var wordList = Seq.empty[String] ==> wordList: Seq[String] = List()
    // With no type parameter given, it defaults to Nothing.
    return receivers.map(_.streamId -> Seq.empty).toMap
  }

  // groupBy groups the executors by host, returning
  // Map[host, List[ExecutorCacheTaskLocation]]; in this respect the Scala API
  // is richer than the Java API.
  val hostToExecutors = executors.groupBy(_.host)
  // One ArrayBuffer[TaskLocation] per receiver: the array has receivers.length elements.
  val scheduledLocations = Array.fill(receivers.length)(new mutable.ArrayBuffer[TaskLocation])
  /**
   * ExecutorCacheTaskLocation means the data lives in the executor's memory, i.e. the
   * partition's records are cached there. KafkaRDD, for example, caches its partitions
   * in memory; its toString returns the format executor_$host_$executorId.
   */
  val numReceiversOnExecutor = mutable.HashMap[ExecutorCacheTaskLocation, Int]()
  // Set the initial value to 0: map each key (ExecutorCacheTaskLocation) to 0.
  executors.foreach(e => numReceiversOnExecutor(e) = 0)

  // Firstly, we need to respect "preferredLocation". So if a receiver has
  // "preferredLocation", we need to make sure the "preferredLocation" is in the
  // candidate scheduled executor list.
  // In other words: if a receiver declares data locality via preferredLocation, it must
  // land in the executor scheduling list. SocketReceiver does not override that method
  // (it returns None), so receivers(i).preferredLocation.foreach never runs for it.
  // "until" is half-open [): with length 5, i goes up to 4, just like a for loop.
  for (i <- 0 until receivers.length) {
    // Note: preferredLocation is host but executors are host_executorId
    // Data locality names only the host, whereas an executor carries host plus
    // executorId, e.g. host_executorId.
    receivers(i).preferredLocation.foreach { host =>
      hostToExecutors.get(host) match {
        // If there is a value, executorsOnHost is a List[ExecutorCacheTaskLocation]
        case Some(executorsOnHost) =>
          // preferredLocation is a known host. Select an executor that has the least
          // receivers on this host.
          // minBy returns the element of executorsOnHost with the smallest value in
          // numReceiversOnExecutor. The Scala source is interesting here: minBy takes
          // an implicit Ordering parameter, so before comparing, an implicit
          // Ordering[T] must exist for the values being compared.
          val leastScheduledExecutor =
            executorsOnHost.minBy(executor => numReceiversOnExecutor(executor))
          scheduledLocations(i) += leastScheduledExecutor
          numReceiversOnExecutor(leastScheduledExecutor) =
            numReceiversOnExecutor(leastScheduledExecutor) + 1
        case None =>
          // preferredLocation is an unknown host.
          // Note: There are two cases:
          // 1. This executor is not up. But it may be up later.
          // 2. This executor is dead, or it's not a host in the cluster.
          // Currently, simply add host to the scheduled executors.
          // Note: host could be `HDFSCacheTaskLocation`, so use `TaskLocation.apply`
          // to handle this case
          scheduledLocations(i) += TaskLocation(host)
      }
    }
  }

  // For those receivers that don't have preferredLocation, make sure we assign
  // at least one executor to them.
  // scheduledLocations was initialized above with empty ArrayBuffers, so each
  // scheduledLocationsForOneReceiver in this loop is an empty ArrayBuffer[TaskLocation].
  for (scheduledLocationsForOneReceiver <- scheduledLocations.filter(_.isEmpty)) {
    // Select the executor that has the least receivers.
    // minBy(_._2) compares the counts in numReceiversOnExecutor
    // (a mutable.HashMap[ExecutorCacheTaskLocation, Int]) and takes the smallest;
    // a map's key/value pair destructures as a tuple.
    val (leastScheduledExecutor, numReceivers) = numReceiversOnExecutor.minBy(_._2)
    // Add the chosen ExecutorCacheTaskLocation to the ArrayBuffer[TaskLocation]
    scheduledLocationsForOneReceiver += leastScheduledExecutor
    // Then bump its count in the HashMap by 1
    numReceiversOnExecutor(leastScheduledExecutor) = numReceivers + 1
  }

  // Assign idle executors to the receivers that have the fewest candidate executors.
  // The line below yields the idle ExecutorCacheTaskLocations.
  val idleExecutors = numReceiversOnExecutor.filter(_._2 == 0).map(_._1)
  for (executor <- idleExecutors) {
    // Assign an idle executor to the receiver that has least candidate executors.
    val leastScheduledExecutors = scheduledLocations.minBy(_.size)
    leastScheduledExecutors += executor
  }

  // zip pairs the elements at the same index of two collections into tuples.
  // scheduledLocations (the array of ArrayBuffer[TaskLocation]) was built in receiver
  // order, so its indices line up with receivers, giving
  // Map((streamId, ArrayBuffer[ExecutorCacheTaskLocation]), ...), where streamId is
  // the InputDStream's id.
  // Again the conclusion: one receiver corresponds to one executor.
  receivers.map(_.streamId).zip(scheduledLocations).toMap
}
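Since this doubles as a Scala API review, the sketch below exercises the same calls (groupBy, Array.fill, minBy with its implicit Ordering, zip) on plain strings, with no Spark dependency; all names and data are made up:

import scala.collection.mutable

object ScheduleApiDemo extends App {
  val receivers = Seq(0, 1, 2)                                 // stand-ins for streamIds
  val executors = Seq("host1_exec1", "host1_exec2", "host2_exec3")

  // groupBy: Map(host1 -> List(host1_exec1, host1_exec2), host2 -> List(host2_exec3))
  val byHost = executors.groupBy(_.split("_")(0))
  println(byHost)

  // Array.fill: one empty buffer per receiver, just like scheduledLocations
  val scheduled = Array.fill(receivers.length)(mutable.ArrayBuffer[String]())

  // minBy over a count map: pick the least-loaded executor, like numReceiversOnExecutor
  val load = mutable.HashMap(executors.map(_ -> 0): _*)
  for (buf <- scheduled) {
    val (least, n) = load.minBy(_._2) // Ordering[Int] is supplied implicitly
    buf += least
    load(least) = n + 1
  }

  // zip: pair receiver ids with their locations by index, then build the final Map
  println(receivers.zip(scheduled).toMap)
  // Each receiver gets the currently least-loaded executor (ties broken by hash order)
}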
6. Trace startReceiver(receiver, executors), which launches the given receiver on its scheduled executors.
/**
 * Start a receiver along with its scheduled executors.
 * scheduledLocations pins the Receiver to specific machines; although it is a
 * collection, as called from StartAllReceivers this Seq[TaskLocation] holds a
 * single element matching the receiver.
 */
private def startReceiver(
    receiver: Receiver[_],
    scheduledLocations: Seq[TaskLocation]): Unit = {
  // Checks the tracker's state: during startReceiver, trackerState is Started,
  // i.e. neither stopping nor stopped, so this returns true.
  def shouldStartReceiver: Boolean = {
    // It's okay to start when trackerState is Initialized or Started
    !(isTrackerStopping || isTrackerStopped)
  }

  // The receiver's streamId is the InputDStream's id
  val receiverId = receiver.streamId
  // If the Receiver should not start, call onReceiverJobFinish
  if (!shouldStartReceiver) {
    onReceiverJobFinish(receiverId)
    return
  }

  val checkpointDirOption = Option(ssc.checkpointDir)
  val serializableHadoopConf =
    new SerializableConfiguration(ssc.sparkContext.hadoopConfiguration)

  // Function to start the receiver on the worker node.
  // This closure wraps everything needed to start a receiver on a worker.
  val startReceiverFunc: Iterator[Receiver[_]] => Unit =
    (iterator: Iterator[Receiver[_]]) => {
      if (!iterator.hasNext) {
        throw new SparkException(
          "Could not start receiver as object not found.")
      }
      // Get the active TaskContext. attemptNumber says how many times this task has
      // been attempted: the first attempt gets attemptNumber = 0, and later attempts
      // get increasing numbers.
      if (TaskContext.get().attemptNumber() == 0) {
        val receiver = iterator.next()
        // On the first attempt, next() above drains the iterator; otherwise this fails
        assert(iterator.hasNext == false)
        // ReceiverSupervisorImpl supervises the Receiver and handles data writes: each
        // interval, the data the receiver collects is written into BlockGenerator, and
        // ReceiverInputDStream.compute later pulls each interval's data out of
        // BlockGenerator into an RDD.
        val supervisor = new ReceiverSupervisorImpl(
          receiver, SparkEnv.get, serializableHadoopConf.value, checkpointDirOption)
        // This goes through ReceiverSupervisor's start(), whose onStart() is
        // ultimately implemented by the ReceiverSupervisorImpl subclass:
        /* def start() {
             onStart()
             startReceiver()
           } */
        supervisor.start()
        supervisor.awaitTermination()
      } else {
        // attemptNumber > 0 means the TaskScheduler restarted the task; the Receiver
        // must not be restarted here, so just exit.
        // It's restarted by TaskScheduler, but we want to reschedule it again. So exit it.
      }
    }

  // Create the RDD using the scheduledLocations to run the receiver in a Spark job
  val receiverRDD: RDD[Receiver[_]] =
    if (scheduledLocations.isEmpty) {
      ssc.sc.makeRDD(Seq(receiver), 1)
    } else {
      // Yields strings such as "executor_localhost_driver"
      val preferredLocations = scheduledLocations.map(_.toString).distinct
      // Yields RDD[(receiver, List[preferredLocations])]
      ssc.sc.makeRDD(Seq(receiver -> preferredLocations))
    }
  receiverRDD.setName(s"Receiver $receiverId") // name the RDD
  ssc.sparkContext.setJobDescription(s"Streaming job running receiver $receiverId")
  ssc.sparkContext.setCallSite(Option(ssc.getStartSite()).getOrElse(Utils.getCallSite()))

  // Each Receiver corresponds to one Spark submitJob call
  val future = ssc.sparkContext.submitJob[Receiver[_], Unit, Unit](
    receiverRDD, startReceiverFunc, Seq(0), (_, _) => Unit, ())
  // We will keep restarting the receiver job until ReceiverTracker is stopped
  future.onComplete {
    case Success(_) =>
      if (!shouldStartReceiver) {
        onReceiverJobFinish(receiverId)
      } else {
        logInfo(s"Restarting Receiver $receiverId")
        // Re-run the receiver job while the tracker is still running
        self.send(RestartReceiver(receiver))
      }
    case Failure(e) =>
      if (!shouldStartReceiver) {
        onReceiverJobFinish(receiverId)
      } else {
        logError("Receiver has been stopped. Try to restart it.", e)
        logInfo(s"Restarting Receiver $receiverId")
        self.send(RestartReceiver(receiver))
      }
  }(submitJobThreadPool) // the thread pool lets the receivers run concurrently
  logInfo(s"Receiver ${receiver.streamId} started")
}
At this point, each Receiver has been submitted to its executor for execution via:
val future = ssc.sparkContext.submitJob[Receiver[_], Unit, Unit](
receiverRDD, startReceiverFunc, Seq(0), (_, _) => Unit, ())
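SparkContext.submitJob is public API, so the shape used above, (rdd, processPartition, partitions, resultHandler, resultFunc), can be tried in isolation. A minimal sketch; the app name, master, and printed messages are mine:

import org.apache.spark.{SparkConf, SparkContext}
import scala.concurrent.ExecutionContext.Implicits.global
import scala.util.{Failure, Success}

object SubmitJobDemo {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("SubmitJobDemo").setMaster("local[2]"))
    // A one-element, one-partition RDD, like receiverRDD
    val rdd = sc.makeRDD(Seq("receiver-0"), 1)
    val future = sc.submitJob[String, Unit, Unit](
      rdd,
      (it: Iterator[String]) => it.foreach(r => println(s"long-running work for $r")), // runs on the executor
      Seq(0),       // only partition 0, exactly as startReceiver does
      (_, _) => (), // per-partition result handler; nothing to collect
      ())           // overall result
    future.onComplete {
      case Success(_) => println("receiver job completed")
      case Failure(e) => println(s"receiver job failed: $e")
    }
    Thread.sleep(2000) // crude wait for the async job in this demo
    sc.stop()
  }
}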