承接上文,回到Master的onStart的方法体中
// masterWebUiUrl 值 http://luyl152:8080 masterWebUiUrl = "http://" + masterPublicAddress + ":" + webUi.boundPort //启动定时器,定时发送信息,检查worker的状态 checkForWorkerTimeOutTask = forwardMessageThread.scheduleAtFixedRate(new Runnable { override def run(): Unit = Utils.tryLogNonFatalError { //信息会发给自已,即master,因为是send,不需要返回信息,所以会被master的receiver方法接收到,
调用了timeOutDeadWorkers,表示将timeOut的Worker信息从HashSet[WorkerInfo]中去掉 self.send(CheckForWorkerTimeOut) } }, 0, WORKER_TIMEOUT_MS, TimeUnit.MILLISECONDS) //1分钟之后发送
//receive方法接收EndpointRef send方法发送的信息
override def receive: PartialFunction[Any, Unit] ={
……
//启动定时器,定时发送信息,检查worker的状态,默认是60秒检测一下
case CheckForWorkerTimeOut=> {
timeOutDeadWorkers()
}
==>就是将超时的worker去掉
/** Check for, and remove, any timed-out workers */
private def timeOutDeadWorkers() {
  // Copy the workers into an array so we don't modify the hashset while iterating through it.
  // A worker is considered timed out when its lastHeartbeat (the time of its last contact)
  // is older than currentTime - WORKER_TIMEOUT_MS.
  val currentTime = System.currentTimeMillis()
  val toRemove = workers.filter(_.lastHeartbeat < currentTime - WORKER_TIMEOUT_MS).toArray
  for (worker <- toRemove) {
    if (worker.state != WorkerState.DEAD) {
      // First time we notice this worker missed its heartbeat window: warn and remove it
      // from the Master's active bookkeeping (marks it DEAD, keeps it visible in the UI).
      logWarning("Removing %s because we got no heartbeat in %d seconds".format(
        worker.id, WORKER_TIMEOUT_MS / 1000))
      removeWorker(worker)
    } else {
      // Already DEAD: keep it around for REAPER_ITERATIONS extra timeout periods so it
      // stays visible in the UI, then drop it from the workers set entirely.
      if (worker.lastHeartbeat < currentTime - ((REAPER_ITERATIONS + 1) * WORKER_TIMEOUT_MS)) {
        workers -= worker // we've seen this DEAD worker in the UI, etc. for long enough; cull it
      }
    }
  }
}
===>这个RestServer主要提供给Cluster模式使用,后面再分析
//启动一个Rest Server,默认是true if (restServerEnabled) { val port = conf.getInt("spark.master.rest.port", 6066) //address.host就是当前master的ip restServer = Some(new StandaloneRestServer(address.host, port, conf, self, masterUrl)) } //实际上还是启动一个jettyserver对应servlet restServerBoundPort = restServer.map(_.start()) //会收集spark组件的状态 masterMetricsSystem.registerSource(masterSource) masterMetricsSystem.start() applicationMetricsSystem.start() // Attach the master and app metrics servlet handler to the web ui after the metrics systems are // started. masterMetricsSystem.getServletHandlers.foreach(webUi.attachHandler) applicationMetricsSystem.getServletHandlers.foreach(webUi.attachHandler) val serializer = new JavaSerializer(conf)
//在HA的模式下面RECOVERY_MODE是对应ZOOKEEPER,将zk的持久化引擎和选举代理提取出来
//查看“spark-core_10: org.apache.spark.deploy.master.Master源码解析2” val (persistenceEngine_, leaderElectionAgent_) = RECOVERY_MODE match { case "ZOOKEEPER" => logInfo("Persisting recovery state to ZooKeeper") val zkFactory = new ZooKeeperRecoveryModeFactory(conf, serializer) //(ZooKeeperPersistenceEngine ,ZooKeeperLeaderElectionAgent) (zkFactory.createPersistenceEngine(), zkFactory.createLeaderElectionAgent(this)) case "FILESYSTEM" => val fsFactory = new FileSystemRecoveryModeFactory(conf, serializer) (fsFactory.createPersistenceEngine(), fsFactory.createLeaderElectionAgent(this)) case "CUSTOM" => val clazz = Utils.classForName(conf.get("spark.deploy.recoveryMode.factory")) val factory = clazz.getConstructor(classOf[SparkConf], classOf[Serializer]) .newInstance(conf, serializer) .asInstanceOf[StandaloneRecoveryModeFactory] (factory.createPersistenceEngine(), factory.createLeaderElectionAgent(this)) case _ => (new BlackHolePersistenceEngine(), new MonarchyLeaderAgent(this)) } persistenceEngine = persistenceEngine_ leaderElectionAgent = leaderElectionAgent_ }
====》初始化出来的ZooKeeperLeaderElectionAgent,会调用ha选举机制
(zkFactory.createPersistenceEngine(), zkFactory.createLeaderElectionAgent(this))
===》进入createLeaderElectionAgent(this)代码
private[master] class ZooKeeperRecoveryModeFactory(conf: SparkConf, serializer: Serializer)
  extends StandaloneRecoveryModeFactory(conf, serializer) {

  /**
   * PersistenceEngine defines how persistent state (worker, driver, application
   * info) is stored for recovery; this variant keeps it in ZooKeeper.
   */
  def createPersistenceEngine(): PersistenceEngine =
    new ZooKeeperPersistenceEngine(conf, serializer)

  /**
   * Creates the LeaderElectionAgent that decides, via ZooKeeper, which Master
   * instance is elected leader.
   */
  def createLeaderElectionAgent(master: LeaderElectable): LeaderElectionAgent =
    new ZooKeeperLeaderElectionAgent(master, conf)
}
===>当前的Master本身就是一个LeaderElectable子类
private[deploy] class Master(
。。。) extends ThreadSafeRpcEndpoint with Logging with LeaderElectable {
====》查看一下ZookeeperLeaderElectionAgent是如何触发选举Leader和StandBy的
private[master] class ZooKeeperLeaderElectionAgent(val masterInstance: LeaderElectable,
conf: SparkConf) extends LeaderLatchListener withLeaderElectionAgent with Logging {
val WORKING_DIR= conf.get("spark.deploy.zookeeper.dir", "/spark") + "/leader_election"
。。。
start()
// Creates the ZooKeeper client and a Curator LeaderLatch rooted at WORKING_DIR,
// registers this agent as a LeaderLatchListener, and starts the latch, which
// kicks off the leader election.
private def start() {
logInfo("Starting ZooKeeper LeaderElection agent")
zk =SparkCuratorUtil.newClient(conf)
// Follow the LeaderLatch source to see how the leader election is actually driven.
leaderLatch = new LeaderLatch(zk, WORKING_DIR)
// Register this agent so LeaderLatch calls back isLeader()/notLeader() on it.
leaderLatch.addListener(this)
leaderLatch.start()
}
====>在LeaderLatch源码中可以看到这一段代码input.isLeader(),这个input就是ZookeeperLeaderElectionAgent
privatesynchronized void setLeadership(boolean newValue){
boolean oldValue= hasLeadership.getAndSet(newValue);
if ( oldValue && !newValue ) { // Lost leadership, was true, now false
listeners.forEach(new Function<LeaderLatchListener, Void>(){
@Override
public Voidapply(LeaderLatchListener listener)
{
listener.notLeader();
return null;
}
});
}elseif ( !oldValue && newValue ) { // Gained leadership, was false, now true
listeners.forEach(new Function<LeaderLatchListener, Void>(){
@Override
public Voidapply(LeaderLatchListener input) {
//listeners里面的ZookeeperLeaderElectionAgent,就是上面leaderLatch.addListener(this)
input.isLeader();
return null;
}
});
}
notifyAll();
}
===>所以可以看出isLeader()被调用了
/**
* 当LeaderLatch的状态从hasLeadership= false变为hasLeadership = true时,这被调用。
请注意,在这种方法调用发生的时候,有可能导致领导层失败。如果发生这种情况,您也可以期待{@link #notLeader()}被调用。 是由LeaderLatch触发的
*/
// LeaderLatchListener callback, fired by Curator when this latch transitions
// from hasLeadership = false to true. Leadership can already be lost again by
// the time this runs — hence the re-check below; in that case notLeader()
// will also be invoked by the latch.
override def isLeader() {
synchronized {
// could have lost leadership by now.
if (!leaderLatch.hasLeadership) {
return
}
logInfo("We have gained leadership")
updateLeadershipStatus(true)
}
}
。。
//该方法是由isLeader和notLeader方法调用进来的
private def updateLeadershipStatus(isLeader: Boolean) {
  // Only act on actual transitions; a redundant notification leaves state untouched.
  (isLeader, status) match {
    case (true, LeadershipStatus.NOT_LEADER) =>
      status = LeadershipStatus.LEADER
      // masterInstance is the Master itself; tell it that it won the election.
      masterInstance.electedLeader()
    case (false, LeadershipStatus.LEADER) =>
      status = LeadershipStatus.NOT_LEADER
      masterInstance.revokedLeadership()
    case _ => // already in the requested state; nothing to do
  }
}
…..
}
===》因此又回到Master.electedLeader()代码上
//ZooKeeperLeaderElectionAgent的 updateLeadershipStatus调用的
// LeaderElectable callback, invoked by ZooKeeperLeaderElectionAgent's
// updateLeadershipStatus when this node wins the election. Sends the
// ElectedLeader message to this Master's own receive loop via send
// (fire-and-forget; no reply is expected).
override def electedLeader() {
self.send(ElectedLeader)
}
===》给Master自己发了一个ElectedLeader这个case class,所以找到MasterRpcEndPoint的receive偏函数
//receive方法接收EndpointRef send方法发送的信息
override def receive: PartialFunction[Any, Unit] ={
//Leader选举、recovery和各种状态查询
case ElectedLeader=> {
val (storedApps, storedDrivers, storedWorkers) = persistenceEngine.readPersistedData(rpcEnv)
state = if (storedApps.isEmpty && storedDrivers.isEmpty&& storedWorkers.isEmpty) {
RecoveryState.ALIVE
} else{
RecoveryState.RECOVERING
}
logInfo("I have been elected leader! New state: " + state)
if (state == RecoveryState.RECOVERING) {
beginRecovery(storedApps, storedDrivers, storedWorkers)
recoveryCompletionTask = forwardMessageThread.schedule(new Runnable{
override def run(): Unit = Utils.tryLogNonFatalError {
self.send(CompleteRecovery)
}
}, WORKER_TIMEOUT_MS, TimeUnit.MILLISECONDS)
}
}
到此,整个master初始化结束