spark-core_14:Worker源码分析2-Worker与Master通信

29 篇文章 4 订阅

承接上文,在上文中Worker的onStart(){。。。。registerWithMaster() 。。。}将自己的信息注册到Master上

/**
 * Starts registering this worker with the masters unless a registration retry timer is
 * already pending.
 *
 * When `registrationRetryTimer` is `None` (its initial value according to the article), the
 * worker is marked as unregistered, registration tasks are submitted through
 * `tryRegisterAllMasters()`, the connection attempt counter is reset to 0, and a fixed-rate
 * task is scheduled that sends `ReregisterWithMaster` to this endpoint. When a timer already
 * exists, only an informational message is logged.
 *
 * NOTE(review): the article states that the `ReregisterWithMaster` handler (not shown here)
 * keeps retrying until `registered` becomes true, cancels the pending futures once
 * registration succeeds, and makes the worker exit after a retry threshold is exceeded.
 * It also states that, for thread safety, this method must only be called from the
 * RpcEndpoint. Confirm both against the handler and the callers.
 */
private def registerWithMaster(): Unit = {
  // onDisconnected may be triggered multiple times, so don't attempt registration
  // if there are outstanding registration attempts scheduled.
  registrationRetryTimer match {
    case None =>
      registered = false
      // Submit the registration tasks and keep their futures.
      registerMasterFutures = tryRegisterAllMasters()
      connectionAttemptCount = 0
      // The first run happens after INITIAL_REGISTRATION_RETRY_INTERVAL_SECONDS and the task
      // then repeats with the same period.
      // NOTE(review): the article describes this interval as a rounded value between 1 and
      // 15 seconds; confirm against its definition.
      registrationRetryTimer = Some(forwordMessageScheduler.scheduleAtFixedRate(
        new Runnable {
          override def run(): Unit = Utils.tryLogNonFatalError {
            // Option(self) skips the send when `self` is null.
            Option(self).foreach(_.send(ReregisterWithMaster))
          }
        },
        INITIAL_REGISTRATION_RETRY_INTERVAL_SECONDS,
        INITIAL_REGISTRATION_RETRY_INTERVAL_SECONDS,
        TimeUnit.SECONDS))
    case Some(_) =>
      logInfo("Not spawning another attempt to register with the master, since there is an" +
        " attempt scheduled already.")
  }
}

一、查看一下tryRegisterAllMasters()会对所有Master进行注册

//会给所有master进行注册
/**
 * Submits one registration task per entry of `masterRpcAddresses` to
 * `registerMasterThreadPool` and returns the futures of the submitted tasks.
 *
 * Each task resolves the master's endpoint reference and calls
 * `registerWithMaster(masterEndpoint)` with it. An `InterruptedException` is treated as a
 * cancellation and ignored; any other non-fatal error is logged as a warning.
 *
 * NOTE(review): the article says the thread pool has as many threads as there are masters,
 * and gives Array(RpcAddress(luyl152, 7077), RpcAddress(luyl153, 7077),
 * RpcAddress(luyl154, 7077)) as an example of `masterRpcAddresses`; neither is shown here.
 */
private def tryRegisterAllMasters(): Array[JFuture[_]] = {
  masterRpcAddresses.map { address =>
    val registrationTask: Runnable = new Runnable {
      override def run(): Unit = {
        try {
          logInfo("Connecting to master " + address + "...")
          // Resolve the reference used to talk to this master, then register through it.
          val endpointRef =
            rpcEnv.setupEndpointRef(Master.SYSTEM_NAME, address, Master.ENDPOINT_NAME)
          registerWithMaster(endpointRef)
        } catch {
          case ie: InterruptedException => // Cancelled
          case NonFatal(e) => logWarning(s"Failed to connect to master $address", e)
        }
      }
    }
    registerMasterThreadPool.submit(registrationTask)
  }
}

1,查看registerWithMaster(masterEndpoint)发信息给master进行注册

// 会将Worker信息注册到master上,该方法执行完成之后,表示worker注册完毕
/**
 * Sends a `RegisterWorker` request describing this worker to the given master endpoint and
 * handles the asynchronous reply.
 *
 * A successful reply is passed to `handleRegisterResponse`; a failed ask is logged as an
 * error and terminates the JVM with exit code 1.
 *
 * NOTE(review): the article states that the ask is served by the master's
 * `receiveAndReply`, that `ask` and `askWithRetry` both end up there, and that `workerId`
 * looks like "worker-20180321165947-luyl153-<worker rpc port>"; confirm against the
 * RPC layer.
 *
 * @param masterEndpoint reference to the master endpoint to register with
 */
private def registerWithMaster(masterEndpoint: RpcEndpointRef): Unit = {
  val registration = RegisterWorker(
    workerId, host, port, self, cores, memory, webUi.boundPort, publicAddress)
  val reply = masterEndpoint.ask[RegisterWorkerResponse](registration)
  // This is a very fast action so we can use "ThreadUtils.sameThread"
  reply.onComplete {
    case Success(response) =>
      // Process the master's answer; non-fatal errors are logged instead of propagated.
      Utils.tryLogNonFatalError {
        handleRegisterResponse(response)
      }
    case Failure(cause) =>
      logError(s"Cannot register with master: ${masterEndpoint.address}", cause)
      System.exit(1)
  }(ThreadUtils.sameThread)
}

2,查看一下Master端RpcEndpoint对应的receiveAndReply方法做了什么?

===》这是Master的receiveAndReply方法

// Master-side handler for messages that expect a reply. Only the RegisterWorker case is
// shown in this excerpt; the rest of the partial function is outside this view.
override def receiveAndReply(context: RpcCallContext): PartialFunction[Any, Unit] = {
  // Per the article, this message is sent by the worker's
  // registerWithMaster(masterEndpoint: RpcEndpointRef).
  case RegisterWorker(
      id, workerHost, workerPort, workerRef, cores, memory, workerUiPort, publicAddress) => {
    logInfo("Registering worker %s:%d with %d cores, %s RAM".format(
      workerHost, workerPort, cores, Utils.megabytesToString(memory)))
    // Refuse the request when this master is in standby or the worker id is already known.
    if (state == RecoveryState.STANDBY) {
      context.reply(MasterInStandby)
    } else if (idToWorker.contains(id)) {
      context.reply(RegisterWorkerFailed("Duplicate worker ID"))
    } else {
      // Wrap the registration data in a WorkerInfo.
      // NOTE(review): the article says a new WorkerInfo starts in WorkerState.ALIVE; not
      // shown here.
      val worker = new WorkerInfo(id, workerHost, workerPort, cores, memory,
        workerRef, workerUiPort, publicAddress)
      // registerWorker() drops DEAD workers on the same host/port and an UNKNOWN worker at
      // the same address, then records the WorkerInfo in workers, idToWorker (keyed by
      // worker id) and addressToWorker (keyed by RpcAddress). It returns false when a
      // worker in any other state already occupies the address.
      if (registerWorker(worker)) {
        // Persist the worker through the configured persistence engine.
        // NOTE(review): per the article, with ZooKeeper-based HA this stores the WorkerInfo
        // in ZooKeeper; confirm against the engine in use.
        persistenceEngine.addWorker(worker)
        // Reply with this master's endpoint reference and its web UI URL
        // (e.g. http://luyl152:8080 in the article's setup).
        context.reply(RegisteredWorker(self, masterWebUiUrl))
        schedule() // NOTE(review): the article says this mainly matters for cluster mode
      } else {
        val workerAddress = worker.endpoint.address
        logWarning("Worker registration failed. Attempted to re-register worker at same " +
          "address: " + workerAddress)
        context.reply(RegisterWorkerFailed("Attempted to re-register worker at same address: "
          + workerAddress))
      }
    }
  }

 

===>registerWorker()方法会先移除同一host和port上状态为WorkerState.DEAD的旧worker;如果同一RpcAddress上已有状态为UNKNOWN的worker,也会将其移除;如果该地址上已有其它状态的worker,则返回false,注册失败。对于有效的worker,会将当前的WorkerInfo放到workers:HashSet[WorkerInfo]中,放到idToWorker:HashMap[String, WorkerInfo]中(key是worker的id),也放到addressToWorker:HashMap[RpcAddress, WorkerInfo]中(key是worker的RpcAddress)。

/**
 * Records a newly registering worker in the master's bookkeeping structures.
 *
 * Steps, in order:
 *  1. Every worker in `workers` with the same host and port as `worker` whose state is
 *     `WorkerState.DEAD` is removed from `workers`.
 *  2. If `addressToWorker` already holds a worker for the same RPC address, that worker is
 *     removed through `removeWorker` when its state is `WorkerState.UNKNOWN`; in any other
 *     state the registration is rejected.
 *  3. On acceptance the worker is added to `workers`, `idToWorker` (keyed by worker id)
 *     and `addressToWorker` (keyed by RPC address).
 *
 * @param worker the worker that asks to be registered
 * @return true when the worker was recorded, false when a worker in a state other than
 *         UNKNOWN is already registered at the same address
 */
private def registerWorker(worker: WorkerInfo): Boolean = {
  // There may be one or more refs to dead workers on this same node (w/ different ID's),
  // remove them.
  workers.filter { w =>
    (w.host == worker.host && w.port == worker.port) && (w.state == WorkerState.DEAD)
  }.foreach { w =>
    workers -= w
  }

  val workerAddress = worker.endpoint.address
  // A single lookup decides whether the address can be taken by the new worker.
  val addressAvailable = addressToWorker.get(workerAddress) match {
    case None =>
      true
    case Some(oldWorker) if oldWorker.state == WorkerState.UNKNOWN =>
      // A worker registering from UNKNOWN implies that the worker was restarted during
      // recovery. The old worker must thus be dead, so we will remove it and accept the
      // new worker.
      removeWorker(oldWorker)
      true
    case Some(_) =>
      logInfo("Attempted to re-register worker at same address: " + workerAddress)
      false
  }

  if (addressAvailable) {
    workers += worker
    idToWorker(worker.id) = worker
    addressToWorker(workerAddress) = worker
  }
  addressAvailable
}

 

3,当Worker在Master上注册成功之后,Master会将RegisteredWorker(这个case class就是RegisterWorkerResponse的子类)回复给Worker,如上面调用的:

context.reply(RegisteredWorker(self, masterWebUiUrl))

===》再回到Worker的registerWithMaster()方法中

// 会将Worker信息注册到master上,该方法执行完成之后,表示worker注册完毕
/**
 * Sends a `RegisterWorker` request describing this worker to the given master endpoint and
 * handles the asynchronous reply.
 *
 * A successful reply is passed to `handleRegisterResponse`; a failed ask is logged as an
 * error and terminates the JVM with exit code 1.
 *
 * NOTE(review): the article states that the ask is served by the master's
 * `receiveAndReply`, that `ask` and `askWithRetry` both end up there, and that `workerId`
 * looks like "worker-20180321165947-luyl153-<worker rpc port>"; confirm against the
 * RPC layer.
 *
 * @param masterEndpoint reference to the master endpoint to register with
 */
private def registerWithMaster(masterEndpoint: RpcEndpointRef): Unit = {
  val registration = RegisterWorker(
    workerId, host, port, self, cores, memory, webUi.boundPort, publicAddress)
  val reply = masterEndpoint.ask[RegisterWorkerResponse](registration)
  // This is a very fast action so we can use "ThreadUtils.sameThread"
  reply.onComplete {
    case Success(response) =>
      // Process the master's answer; non-fatal errors are logged instead of propagated.
      Utils.tryLogNonFatalError {
        handleRegisterResponse(response)
      }
    case Failure(cause) =>
      logError(s"Cannot register with master: ${masterEndpoint.address}", cause)
      System.exit(1)
  }(ThreadUtils.sameThread)
}

 

====》查看一下worker的handleRegisterResponse(msg)

/**
 * Handles the master's reply to a `RegisterWorker` request. Runs inside `synchronized`.
 *
 *  - `RegisteredWorker`: marks this worker as registered, calls `changeMaster` with the
 *    master's reference and web UI URL, and schedules a periodic `SendHeartbeat` message
 *    to this endpoint (no initial delay, period `HEARTBEAT_MILLIS`). When `CLEANUP_ENABLED`
 *    is set, it also schedules a periodic `WorkDirCleanup` message with
 *    `CLEANUP_INTERVAL_MILLIS` as both delay and period.
 *  - `RegisterWorkerFailed`: if the worker is not yet registered, logs the failure and
 *    terminates the JVM with exit code 1; otherwise the reply is ignored.
 *  - `MasterInStandby`: ignored.
 *
 * NOTE(review): per the article, `changeMaster` stores the reference in
 * `master: Option[RpcEndpointRef]` and cancels the registration tasks still running
 * against other masters, and the heartbeat period is 15 seconds; neither is visible here.
 *
 * @param msg the reply received from the master
 */
private def handleRegisterResponse(msg: RegisterWorkerResponse): Unit = synchronized {
  // Schedules a fixed-rate task that sends `message` to this endpoint itself; non-fatal
  // errors raised while sending are logged by Utils.tryLogNonFatalError.
  def sendToSelfAtFixedRate(message: Any, initialDelayMillis: Long, periodMillis: Long): Unit = {
    forwordMessageScheduler.scheduleAtFixedRate(new Runnable {
      override def run(): Unit = Utils.tryLogNonFatalError {
        self.send(message)
      }
    }, initialDelayMillis, periodMillis, TimeUnit.MILLISECONDS)
  }

  msg match {
    case RegisteredWorker(masterRef, masterWebUiUrl) =>
      logInfo("Successfully registered with master " + masterRef.address.toSparkURL)
      registered = true
      changeMaster(masterRef, masterWebUiUrl)
      // The heartbeat is sent to this endpoint first; its receive method handles
      // SendHeartbeat.
      sendToSelfAtFixedRate(SendHeartbeat, 0, HEARTBEAT_MILLIS)
      if (CLEANUP_ENABLED) {
        logInfo(
          s"Worker cleanup enabled; old application directories will be deleted in: $workDir")
        sendToSelfAtFixedRate(WorkDirCleanup, CLEANUP_INTERVAL_MILLIS, CLEANUP_INTERVAL_MILLIS)
      }

    case RegisterWorkerFailed(message) =>
      if (!registered) {
        logError("Worker registration failed: " + message)
        System.exit(1)
      }

    case MasterInStandby =>
      // Ignore. Master not yet ready.
  }
}

 

===》注册成功之后,Worker会每15秒给自己发送一次心跳消息self.send(SendHeartbeat),然后由WorkerRpcEndpoint的receive方法给Master发送Heartbeat消息

// Worker-side handler for one-way messages; the whole partial function runs inside
// `synchronized`. Only the SendHeartbeat case is shown in this excerpt.
override def receive: PartialFunction[Any, Unit] =synchronized {
 
case SendHeartbeat=>
   
if (connected) {
     
// Forward a Heartbeat carrying this worker's id and endpoint reference to the current
// master through sendToMaster. Nothing is sent while `connected` is false.
      sendToMaster(Heartbeat(workerId, self))
   
}

===>sendToMaster()方法会将Heartbeat(workerId,WorkerRpcEndpoint)发送给Master

 

/**
 * Send a message to the current master. If we have not yet registered successfully with any
 * master, the message will be dropped.
 *
 * When `master` holds a reference, the message is delivered with its `send` method;
 * otherwise a warning is logged and nothing is sent.
 *
 * NOTE(review): the article states that `master` is assigned by
 * `changeMaster(masterRef, masterWebUiUrl)` during `handleRegisterResponse`, that the
 * master checks worker liveness every 60s while the worker sends a heartbeat every 15s,
 * and quotes
 * `private val HEARTBEAT_MILLIS = conf.getLong("spark.worker.timeout", 60) * 1000 / 4`;
 * none of this is visible in this block.
 *
 * @param message the message to deliver, e.g. `Heartbeat(workerId, self)`
 */
private def sendToMaster(message: Any): Unit = {
  master match {
    case Some(masterRef) =>
      masterRef.send(message)
    case None =>
      logWarning(
        s"Dropping $message because the connection to master has not yet been established")
  }
}

4,再次进入Master

//receive方法接收EndpointRef send方法发送的信息
// Master-side handler for one-way messages delivered with EndpointRef.send. Only the
// Heartbeat case is shown in this excerpt; other cases are elided by the article.
override def receive: PartialFunction[Any, Unit] = {
…..
  // Heartbeat sent by a worker, e.g.
  // Heartbeat("worker-20180321165947-luyl153-<worker rpc port>", WorkerRpcEndpoint).
  case Heartbeat(workerId, worker) => {
    // idToWorker is filled by registerWorker() when a RegisterWorker request is accepted.
    idToWorker.get(workerId) match {
      case Some(workerInfo) =>
        // Known worker: record the time of this heartbeat.
        workerInfo.lastHeartbeat = System.currentTimeMillis()
      case None =>
        if (workers.map(_.id).contains(workerId)) {
          logWarning(s"Got heartbeat from unregistered worker $workerId." +
            " Asking it to re-register.")

// The id is missing from idToWorker but still present in `workers`: ask the worker to
// reconnect using this master's URL.
// NOTE(review): the article says this is how workers re-register with a standby master
// that has taken over after a failover; confirm against the recovery code.

         
worker.send(ReconnectWorker(masterUrl))
       
} else{
         
logWarning(s"Got heartbeat from unregistered worker $workerId."+
           
"This worker was never registered, so ignoring the heartbeat.")
       
}
    }
  }

 

 

到此,整个Worker和Master的通信就结束了。。。


  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值