本文使用Scala语言实现了对Spark集群启动过程的模拟。一个Spark集群(非高可用)由一个主节点(Master)和N个从节点(Worker)组成。Spark在1.6版本以前使用Akka实现集群的通信,在1.6版本后,使用Netty框架取代了Akka,本文的模拟仍是在Akka的框架下实现的。通过模拟可以更方便地理解Spark集群的启动过程。
启动过程描述
- Master启动时,会开启定时任务,定期检查Worker列表。
- Worker启动的时候,Worker会根据配置文件中的Master信息(本文通过从控制台输入模拟这一过程),定位到Master,并向Master发送注册信息。
- Master接收到注册信息后,检查Worker列表,如果列表中不存在该Worker,就将该Worker的注册信息加入Worker列表。并向Worker发送注册成功信息。
- Worker接收到注册成功信息后,创建一个定时器,定时发送心跳数据。
- 如果Master在设定的时长内没有收到某个Worker的心跳数据,则将该Worker从Worker列表中删除,表示该Worker已经下线。
代码实现
1 定义发送信息的case class
/**
 * Registration request sent by a Worker to the Master.
 *
 * Case-class parameters are public vals and case classes are serializable
 * by default, so neither needs to be spelled out.
 *
 * @param workerID unique identifier of the registering worker
 * @param cpu      CPU value reported by the worker
 * @param memory   memory value reported by the worker
 */
case class Regist(workerID: String, cpu: Int, memory: Int)
/**
 * Acknowledgement sent by the Master to a Worker after a successful registration.
 *
 * @param masterURL URL string describing the master that accepted the registration
 */
case class Registed(masterURL: String)
/**
 * Heartbeat message sent by a Worker to the Master.
 *
 * @param workerID unique identifier of the worker sending the heartbeat
 */
case class HeartBeat(workerID: String)
/**
 * Timer tick a Worker sends to itself; on receipt the Worker forwards a
 * heartbeat to the Master.
 */
case object HeartBeatToSelf
/**
 * Timer tick the Master sends to itself to trigger the periodic check of
 * its worker list.
 */
case object CheckTimeOut
2 定义一个封装了Worker信息的类,用在Master的Worker列表中,用来保存Worker信息
/**
 * Mutable record describing one worker.
 *
 * @param workerID unique identifier of the worker
 * @param cpu      CPU value reported by the worker
 * @param memory   memory value reported by the worker
 */
class WorkerInfo(val workerID: String, val cpu: Int, val memory: Int) {

  /** Timestamp of the most recent heartbeat; starts at 0 until it is assigned. */
  var lastHeartBeatTime: Long = 0L
}
3 Master
import akka.actor.{Actor, ActorSystem, Props}
import com.typesafe.config.ConfigFactory
import scala.collection.mutable
/**
 * Master node of the simulated cluster.
 *
 * Keeps a table of registered workers, refreshes each worker's timestamp when
 * a heartbeat arrives, and periodically evicts workers whose last heartbeat is
 * older than the timeout.
 *
 * @param hostname host name of the master, echoed back to workers in the registration reply
 * @param port     port of the master, echoed back to workers in the registration reply
 */
class Master(hostname:String,port:Int) extends Actor{

  // Registered workers keyed by worker id. A single map is the only source of
  // truth, so there is no second collection to keep in sync with it.
  private val workerInfoMap: mutable.HashMap[String, WorkerInfo] =
    mutable.HashMap.empty[String, WorkerInfo]

  // How often the worker table is checked, in milliseconds.
  private val checkIntervalMillis: Long = 15000L

  // A worker is evicted once its last heartbeat is older than this, in milliseconds.
  private val workerTimeoutMillis: Long = 15000L

  // Handle of the periodic self-check, kept so it can be cancelled in postStop.
  private var checkTask: Option[akka.actor.Cancellable] = None

  /**
   * Starts the timer that sends CheckTimeOut to this actor, first immediately
   * and then once per check interval.
   */
  override def preStart(): Unit = {
    import scala.concurrent.duration._
    import context.dispatcher
    checkTask = Some(
      context.system.scheduler.schedule(0.millis, checkIntervalMillis.millis, self, CheckTimeOut)
    )
  }

  /**
   * Cancels the periodic check so the scheduler stops sending messages to a
   * stopped actor.
   */
  override def postStop(): Unit = {
    checkTask.foreach(_.cancel())
    checkTask = None
  }

  /**
   * Message handler for registration, heartbeat and timeout-check messages.
   *
   * @return the actor's receive behaviour
   */
  override def receive: Receive = {
    // Registration: only a worker id that is not yet known is added and acknowledged.
    case Regist(uuid, cpu, memory) =>
      if (!workerInfoMap.contains(uuid)) {
        val workerInfo = new WorkerInfo(uuid, cpu, memory)
        // Treat the registration itself as the first sign of life. Otherwise the
        // timestamp stays 0 and a check that runs before the first heartbeat
        // would evict the worker right after it registered.
        workerInfo.lastHeartBeatTime = System.currentTimeMillis()
        workerInfoMap(uuid) = workerInfo
        println("workerId为" + uuid + "的节点注册成功")
        sender() ! Registed(s"masterURL:http://${hostname}:${port}")
      }

    // Heartbeat: refresh the timestamp of a known worker. Heartbeats from
    // unknown ids (never registered, or already evicted) are ignored instead of
    // throwing NoSuchElementException from the map lookup.
    case HeartBeat(workerID) =>
      val now = System.currentTimeMillis()
      workerInfoMap.get(workerID).foreach { info =>
        info.lastHeartBeatTime = now
      }

    // Periodic check: evict every worker whose last heartbeat is too old.
    case CheckTimeOut =>
      val now = System.currentTimeMillis()
      // Collect the ids first; removing from a mutable map while iterating over it is unsafe.
      val expiredIds = workerInfoMap.collect {
        case (id, info) if now - info.lastHeartBeatTime > workerTimeoutMillis => id
      }.toList
      workerInfoMap --= expiredIds
      println("成功注册的节点数:" + workerInfoMap.size)
  }
}
/**
 * Entry point and shared names for the Master actor system.
 */
object Master{
  /** Name of the master's actor system; workers use it to build the master's actor path. */
  val MASTER_ACTORSYSTEM_NAME: String = "masterActorSystem"

  /** Name of the master actor; workers use it to build the master's actor path. */
  val MASTER_ACTOR_NAME: String = "master"

  /**
   * Starts a remoting-enabled actor system and creates the Master actor in it.
   *
   * @param args `args(0)` is the host name to bind to, `args(1)` is the port
   */
  def main(args: Array[String]): Unit = {
    // Fail with a readable message instead of an ArrayIndexOutOfBoundsException.
    if (args.length < 2) {
      System.err.println("Usage: Master <hostname> <port>")
      sys.exit(1)
    }
    val hostname = args(0)
    val port = args(1).toInt

    // Remoting configuration for this actor system.
    val str =
      s"""
         |akka.actor.provider = "akka.remote.RemoteActorRefProvider"
         |akka.remote.netty.tcp.hostname ="${hostname}"
         |akka.remote.netty.tcp.port=${port}
         |""".stripMargin
    val conf = ConfigFactory.parseString(str)

    // Create the actor system and the master actor.
    val actorSystem: ActorSystem = ActorSystem(MASTER_ACTORSYSTEM_NAME, conf)
    actorSystem.actorOf(Props(new Master(hostname, port)), MASTER_ACTOR_NAME)
  }
}
4 Worker
import java.util.UUID
import akka.actor.{Actor, ActorSelection, ActorSystem, Props}
import com.typesafe.config.ConfigFactory
/**
 * Worker node of the simulated cluster.
 *
 * On start it registers with the Master. Once the Master acknowledges the
 * registration it sends a heartbeat every three seconds.
 *
 * @param masterName host name of the master node
 * @param masterPort port of the master node
 * @param cpu        CPU value reported to the master
 * @param memory     memory value reported to the master
 */
class Worker(masterName:String,masterPort:Int,cpu:Int,memory:Int) extends Actor{

  // Unique identifier that distinguishes this worker from the others.
  private val uuid: String = UUID.randomUUID().toString

  // Selection pointing at the master actor; assigned in preStart.
  var masterRef:ActorSelection =_

  // Handle of the heartbeat timer, kept so it can be cancelled.
  private var heartBeatTask: Option[akka.actor.Cancellable] = None

  /**
   * Resolves the master's actor path and sends the registration request.
   */
  override def preStart(): Unit = {
    masterRef = context.actorSelection(s"akka.tcp://${Master.MASTER_ACTORSYSTEM_NAME}@${masterName}:${masterPort}/user/${Master.MASTER_ACTOR_NAME}")
    masterRef ! Regist(uuid,cpu,memory)
  }

  /**
   * Cancels the heartbeat timer so the scheduler stops sending messages to a
   * stopped actor.
   */
  override def postStop(): Unit = {
    heartBeatTask.foreach(_.cancel())
    heartBeatTask = None
  }

  /**
   * Message handler for the registration acknowledgement and the heartbeat tick.
   *
   * @return the actor's receive behaviour
   */
  override def receive: Receive = {
    // Registration acknowledged: start the heartbeat timer.
    case Registed(masterURL) =>
      println(s"worker向master申请注册成功,收到返回信息${masterURL}")
      import scala.concurrent.duration._
      import context.dispatcher
      // Cancel any earlier timer first so a repeated acknowledgement cannot
      // stack several heartbeat timers on top of each other.
      heartBeatTask.foreach(_.cancel())
      // The tick goes to this actor first; the heartbeat itself is sent from
      // the HeartBeatToSelf handler below.
      heartBeatTask = Some(
        context.system.scheduler.schedule(0.millis, 3000.millis, self, HeartBeatToSelf)
      )

    // Heartbeat tick: forward a heartbeat to the master.
    case HeartBeatToSelf =>
      masterRef ! HeartBeat(uuid)
  }
}
/**
 * Entry point and shared names for the Worker actor system.
 */
object Worker{
  /** Name of the worker's actor system. */
  val WORKER_ACTORSYSTEM_NAME: String = "workerActorSystem"

  /** Name of the worker actor. */
  val WORKER_ACTOR_NAME: String = "worker"

  /**
   * Starts a remoting-enabled actor system and creates the Worker actor in it.
   *
   * @param args `args(0)` master host name, `args(1)` master port, `args(2)` cpu,
   *             `args(3)` memory, `args(4)` port this worker binds to, and an
   *             optional `args(5)` host name this worker binds to
   */
  def main(args: Array[String]): Unit = {
    // Fail with a readable message instead of an ArrayIndexOutOfBoundsException.
    if (args.length < 5) {
      System.err.println(
        "Usage: Worker <masterHost> <masterPort> <cpu> <memory> <workerPort> [workerHost]")
      sys.exit(1)
    }
    val masterName = args(0)
    val masterPort = args(1).toInt
    val cpu = args(2).toInt
    val memory = args(3).toInt
    val port = args(4).toInt
    // Host name this worker's remoting binds to. It used to be hard-wired to the
    // master's host name, which only works when both run on the same machine.
    // The optional sixth argument overrides it; without it the old value is kept
    // so existing five-argument invocations behave as before.
    val workerHost = if (args.length > 5) args(5) else masterName

    // Remoting configuration for this actor system.
    val str =
      s"""
         |akka.actor.provider = "akka.remote.RemoteActorRefProvider"
         |akka.remote.netty.tcp.hostname = "${workerHost}"
         |akka.remote.netty.tcp.port=${port}
         |""".stripMargin
    val conf = ConfigFactory.parseString(str)

    // Create the actor system and the worker actor.
    val actorSystem = ActorSystem(WORKER_ACTORSYSTEM_NAME, conf)
    actorSystem.actorOf(Props(new Worker(masterName, masterPort, cpu, memory)), WORKER_ACTOR_NAME)
  }
}