Spark SparkEnv Source Code Walkthrough
SparkEnv Object
The companion object of SparkEnv. Its fields and methods are shown below:
SparkEnv Object Fields
@volatile private var env: SparkEnv = _ // holds the single SparkEnv instance for this process
private[spark] val driverSystemName = "sparkDriver" // name of the driver-side RPC system
private[spark] val executorSystemName = "sparkExecutor" // name of the executor-side RPC system
SparkEnv Object Methods
// store a SparkEnv instance in the env field
def set(e: SparkEnv) {
env = e
}
// return the current SparkEnv instance
def get: SparkEnv = {
env
}
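As a quick usage sketch (illustrative, not taken from the Spark source), any code running on a driver or executor can fetch the process-wide instance through the getter and pull components off it:
// minimal sketch: access the current SparkEnv and one of its components
val env = SparkEnv.get
val conf = env.conf // the SparkConf this environment was built with
val blockManager = env.blockManager // created in the create method below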
// create the SparkEnv for the driver side
private[spark] def createDriverEnv(
conf: SparkConf,
isLocal: Boolean, // whether this is local mode
listenerBus: LiveListenerBus,
numCores: Int, // number of driver cores, derived from the spark-submit master setting
mockOutputCommitCoordinator: Option[OutputCommitCoordinator] = None): // defaults to None
SparkEnv = {
// DRIVER_HOST_ADDRESS and spark.driver.port were already set in SparkContext before
// _env = createSparkEnv(_conf, isLocal, listenerBus) was called:
// DRIVER_HOST_ADDRESS holds the real host, while spark.driver.port was set to 0 so that
// the system assigns a random port later
assert(conf.contains(DRIVER_HOST_ADDRESS),
s"${DRIVER_HOST_ADDRESS.key} is not set on the driver!")
assert(conf.contains("spark.driver.port"), "spark.driver.port is not set on the driver!")
val bindAddress = conf.get(DRIVER_BIND_ADDRESS) // driver-side bind host
val advertiseAddress = conf.get(DRIVER_HOST_ADDRESS)
val port = conf.get("spark.driver.port").toInt // 0 at this point
val ioEncryptionKey = if (conf.get(IO_ENCRYPTION_ENABLED)) {
Some(CryptoStreamUtils.createKey(conf))
} else {
None
}
create(
conf,
SparkContext.DRIVER_IDENTIFIER, // the driver's identifier, "driver"
bindAddress, // driver-side bind host
advertiseAddress,
Option(port), //Option(0)
isLocal, // false in yarn-cluster mode
numCores, // number of cores configured for the driver
ioEncryptionKey, //None
listenerBus = listenerBus,
mockOutputCommitCoordinator = mockOutputCommitCoordinator //None
)
}
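For context, this is roughly how SparkContext drives the call above (a simplified sketch of the call site mentioned in the comments, not the exact SparkContext code; numDriverCores is an assumed local value):
// sketch: SparkContext fixes the host/port settings first, then builds the driver env
conf.set(DRIVER_HOST_ADDRESS.key, Utils.localHostName())
conf.setIfMissing("spark.driver.port", "0") // 0 = let the OS pick a free port
val env = SparkEnv.createDriverEnv(conf, isLocal, listenerBus, numDriverCores)
SparkEnv.set(env)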
// create the SparkEnv for the executor side
private[spark] def createExecutorEnv(
conf: SparkConf,
executorId: String,
hostname: String,
numCores: Int,
ioEncryptionKey: Option[Array[Byte]], //None
isLocal: Boolean //false if yarn-cluster
): SparkEnv = {
val env = create(
conf,
executorId,
hostname,
hostname,
None,
isLocal,
numCores,
ioEncryptionKey
)
SparkEnv.set(env)
env
}
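On the executor side, the caller is the executor backend (again a simplified sketch; the variable names are assumptions). Note that hostname is used for both the bind and advertise addresses, and no port is passed, so the RpcEnv binds to a random free port:
// sketch of the executor-side call
val env = SparkEnv.createExecutorEnv(
driverConf, executorId, hostname, numCores, ioEncryptionKey, isLocal = false)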
The create method, which does most of the heavy lifting:
private def create(
conf: SparkConf,
executorId: String, // "driver" when called from createDriverEnv
bindAddress: String, // driver-side bind host (or the executor's hostname)
advertiseAddress: String, // driver-side host (or the executor's hostname)
port: Option[Int], // Option(0) for the driver, None for executors
isLocal: Boolean, // false in yarn-cluster mode
numUsableCores: Int, // number of usable CPU cores
ioEncryptionKey: Option[Array[Byte]], // None unless I/O encryption is enabled
listenerBus: LiveListenerBus = null, // non-null on the driver
mockOutputCommitCoordinator: Option[OutputCommitCoordinator] = None): SparkEnv = { // None by default (used by tests)
val isDriver = executorId == SparkContext.DRIVER_IDENTIFIER // true on the driver
// Listener bus is only used on the driver
if (isDriver) {
assert(listenerBus != null, "Attempted to create driver SparkEnv with null listener bus!")
}
val securityManager = new SecurityManager(conf, ioEncryptionKey)
if (isDriver) {
securityManager.initializeAuth()
}
ioEncryptionKey.foreach { _ =>
if (!securityManager.isEncryptionEnabled()) {
logWarning("I/O encryption enabled without RPC encryption: keys will be visible on the " +
"wire.")
}
}
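// For example (illustrative configuration, not from this file):
//   conf.set("spark.io.encryption.enabled", "true") // turn on I/O (shuffle/spill) encryption
//   conf.set("spark.authenticate", "true")          // required for RPC encryption to protect the key
// Without RPC encryption, the freshly created key would travel to executors in plain text,
// which is exactly what the warning above is about.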
//sparkDriver or sparkExecutor
val systemName = if (isDriver) driverSystemName else executorSystemName
// create the RPC environment (to be covered in detail in a separate post);
// once this call returns, the real host and port have been bound
val rpcEnv = RpcEnv.create(systemName, bindAddress, advertiseAddress, port.getOrElse(-1), conf,
securityManager, numUsableCores, !isDriver)
// Figure out which port RpcEnv actually bound to in case the original port is 0 or occupied.
if (isDriver) {
// record the port that was actually bound
conf.set("spark.driver.port", rpcEnv.address.port.toString)
}
// instantiate a class from its name via reflection
def instantiateClass[T](className: String): T = {
val cls = Utils.classForName(className)
// Look for a constructor taking a SparkConf and a boolean isDriver, then one taking just
// SparkConf, then one taking no arguments
try {
cls.getConstructor(classOf[SparkConf], java.lang.Boolean.TYPE)
.newInstance(conf, new java.lang.Boolean(isDriver))
.asInstanceOf[T]
} catch {
case _: NoSuchMethodException =>
try {
cls.getConstructor(classOf[SparkConf]).newInstance(conf).asInstanceOf[T]
} catch {
case _: NoSuchMethodException =>
cls.getConstructor().newInstance().asInstanceOf[T]
}
}
}
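// Illustration (hypothetical class, not part of Spark): a component declared as
//   class MyComponent(conf: SparkConf, isDriver: Boolean) { ... }
// is matched by the first getConstructor branch; one taking only a SparkConf falls
// into the second branch, and a no-arg constructor into the third.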
// read the class name from the configuration, falling back to the default
def instantiateClassFromConf[T](propertyName: String, defaultClassName: String): T = {
instantiateClass[T](conf.get(propertyName, defaultClassName))
}
// instantiate the serializer; the default is org.apache.spark.serializer.JavaSerializer,
// and it can be overridden via the spark.serializer property in SparkConf
val serializer = instantiateClassFromConf[Serializer](
"spark.serializer", "org.apache.spark.serializer.JavaSerializer")
logDebug(s"Using serializer: ${serializer.getClass}")
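// For example, to switch to Kryo (a real Spark serializer, shown as an illustration):
//   conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")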
// create the serializer manager
val serializerManager = new SerializerManager(serializer, conf, ioEncryptionKey)
// closures are always serialized with plain Java serialization
val closureSerializer = new JavaSerializer(conf)
def registerOrLookupEndpoint(
name: String, endpointCreator: => RpcEndpoint):
RpcEndpointRef = {
if (isDriver) {
logInfo("Registering " + name)
rpcEnv.setupEndpoint(name, endpointCreator)
} else {
RpcUtils.makeDriverRef(name, conf, rpcEnv)
}
}
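// In short: on the driver this registers a brand-new endpoint in the local RpcEnv,
// while on an executor it only builds an RpcEndpointRef pointing at the driver's endpoint.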
// create the broadcast manager
val broadcastManager = new BroadcastManager(isDriver, conf, securityManager)
// there are two kinds of MapOutputTracker: MapOutputTrackerMaster on the driver,
// and MapOutputTrackerWorker on executors
val mapOutputTracker = if (isDriver) {
new MapOutputTrackerMaster(conf, broadcastManager, isLocal)
} else {
new MapOutputTrackerWorker(conf)
}
// Have to assign trackerEndpoint after initialization as MapOutputTrackerEndpoint
// requires the MapOutputTracker itself
mapOutputTracker.trackerEndpoint = registerOrLookupEndpoint(MapOutputTracker.ENDPOINT_NAME,
new MapOutputTrackerMasterEndpoint(
rpcEnv, mapOutputTracker.asInstanceOf[MapOutputTrackerMaster], conf))
// Let the user specify short names for shuffle managers
val shortShuffleMgrNames = Map(
"sort" -> classOf[org.apache.spark.shuffle.sort.SortShuffleManager].getName,
"tungsten-sort" -> classOf[org.apache.spark.shuffle.sort.SortShuffleManager].getName)
val shuffleMgrName = conf.get("spark.shuffle.manager", "sort")
val shuffleMgrClass =
shortShuffleMgrNames.getOrElse(shuffleMgrName.toLowerCase(Locale.ROOT), shuffleMgrName)
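// For example, both of these resolve to SortShuffleManager ("tungsten-sort" has been an
// alias of "sort" since the two implementations were merged in Spark 1.6):
//   conf.set("spark.shuffle.manager", "sort")
//   conf.set("spark.shuffle.manager", "tungsten-sort")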
// create the ShuffleManager
val shuffleManager = instantiateClass[ShuffleManager](shuffleMgrClass)
// whether to use the legacy memory management mode
val useLegacyMemoryManager = conf.getBoolean("spark.memory.useLegacyMode", false)
val memoryManager: MemoryManager =
if (useLegacyMemoryManager) {
new StaticMemoryManager(conf, numUsableCores)
} else {
UnifiedMemoryManager(conf, numUsableCores)
}
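// e.g. conf.set("spark.memory.useLegacyMode", "true") restores the pre-1.6 StaticMemoryManager,
// which keeps a fixed split between execution and storage memory; the default
// UnifiedMemoryManager lets the two regions borrow from each other.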
val blockManagerPort = if (isDriver) {
conf.get(DRIVER_BLOCK_MANAGER_PORT)
} else {
conf.get(BLOCK_MANAGER_PORT)
}
val blockTransferService =
new NettyBlockTransferService(conf, securityManager, bindAddress, advertiseAddress,
blockManagerPort, numUsableCores)
val blockManagerMaster = new BlockManagerMaster(registerOrLookupEndpoint(
BlockManagerMaster.DRIVER_ENDPOINT_NAME,
new BlockManagerMasterEndpoint(rpcEnv, isLocal, conf, listenerBus)),
conf, isDriver)
// NB: blockManager is not valid until initialize() is called later.
val blockManager = new BlockManager(executorId, rpcEnv, blockManagerMaster,
serializerManager, conf, memoryManager, mapOutputTracker, shuffleManager,
blockTransferService, securityManager, numUsableCores)
val metricsSystem = if (isDriver) {
// Don't start metrics system right now for Driver.
// We need to wait for the task scheduler to give us an app ID.
// Then we can start the metrics system.
MetricsSystem.createMetricsSystem("driver", conf, securityManager)
} else {
// We need to set the executor ID before the MetricsSystem is created because sources and
// sinks specified in the metrics configuration file will want to incorporate this executor's
// ID into the metrics they report.
conf.set("spark.executor.id", executorId)
val ms = MetricsSystem.createMetricsSystem("executor", conf, securityManager)
ms.start()
ms
}
val outputCommitCoordinator = mockOutputCommitCoordinator.getOrElse {
new OutputCommitCoordinator(conf, isDriver)
}
val outputCommitCoordinatorRef = registerOrLookupEndpoint("OutputCommitCoordinator",
new OutputCommitCoordinatorEndpoint(rpcEnv, outputCommitCoordinator))
outputCommitCoordinator.coordinatorRef = Some(outputCommitCoordinatorRef)
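// The OutputCommitCoordinator decides which task attempt may commit its output for a given
// stage/partition, so that speculative or retried attempts cannot both commit.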
val envInstance = new SparkEnv(
executorId,
rpcEnv,
serializer,
closureSerializer,
serializerManager,
mapOutputTracker,
shuffleManager,
broadcastManager,
blockManager,
securityManager,
metricsSystem,
memoryManager,
outputCommitCoordinator,
conf)
// Add a reference to tmp dir created by driver, we will delete this tmp dir when stop() is
// called, and we only need to do it for driver. Because driver may run as a service, and if we
// don't delete this tmp dir when sc is stopped, then will create too many tmp dirs.
if (isDriver) {
val sparkFilesDir = Utils.createTempDir(Utils.getLocalDir(conf), "userFiles").getAbsolutePath
envInstance.driverTmpDir = Some(sparkFilesDir)
}
envInstance
}
SparkEnv Class
This class is simple: it just holds references to the following components:
serializer,
closureSerializer,
serializerManager,
mapOutputTracker,
shuffleManager,
broadcastManager,
blockManager,
securityManager,
metricsSystem,
memoryManager,
outputCommitCoordinator
so that when SparkEnv is stopped, they can all be shut down in one place and in a well-defined order; it also holds the driverTmpDir set up above.
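For reference, the shutdown sequence looks roughly like this (an abridged sketch of SparkEnv.stop(); the exact order may differ slightly across Spark versions):
// abridged sketch of SparkEnv.stop()
mapOutputTracker.stop()
shuffleManager.stop()
broadcastManager.stop()
blockManager.stop()
blockManager.master.stop()
metricsSystem.stop()
outputCommitCoordinator.stop()
rpcEnv.shutdown()
rpcEnv.awaitTermination()
// on the driver, driverTmpDir is deleted here as well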