Spark-SparkEnv Source Code Analysis

SparkEnv Object

This is the companion object of SparkEnv. Its fields and methods are described below:

SparkEnv Object Fields

@volatile private var env: SparkEnv = _ // holds the current SparkEnv instance
private[spark] val driverSystemName = "sparkDriver" // RPC system name on the driver side
private[spark] val executorSystemName = "sparkExecutor" // RPC system name on the executor side

SparkEnv Object Methods

// Store a SparkEnv instance in the env field
def set(e: SparkEnv) {
    env = e
  }
// Return the current SparkEnv instance
def get: SparkEnv = {
    env
  }
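
Because the instance is kept in a @volatile field on the companion object, Spark-internal code anywhere in the process can look it up without threading it through method arguments. A tiny illustrative helper (the function name is made up for this post):

import org.apache.spark.SparkEnv

// Illustrative only: returns "driver" on the driver, the executor ID on an executor,
// or None if no SparkEnv has been registered via SparkEnv.set yet.
def currentComponentId(): Option[String] = Option(SparkEnv.get).map(_.executorId)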

// Create the driver-side SparkEnv
private[spark] def createDriverEnv(
      conf: SparkConf,
      isLocal: Boolean, // whether we are running in local mode
      listenerBus: LiveListenerBus,
      numCores: Int, // number of driver cores, derived from the spark-submit master setting
      mockOutputCommitCoordinator: Option[OutputCommitCoordinator] = None): // defaults to None
      SparkEnv = {
      // DRIVER_HOST_ADDRESS and spark.driver.port were already set in SparkContext before
      // _env = createSparkEnv(_conf, isLocal, listenerBus) was called:
      // DRIVER_HOST_ADDRESS holds the real host, while spark.driver.port is set to 0 so the
      // system can assign a random free port later.
    assert(conf.contains(DRIVER_HOST_ADDRESS),
      s"${DRIVER_HOST_ADDRESS.key} is not set on the driver!")
    assert(conf.contains("spark.driver.port"), "spark.driver.port is not set on the driver!")
    val bindAddress = conf.get(DRIVER_BIND_ADDRESS) // driver bind host
    val advertiseAddress = conf.get(DRIVER_HOST_ADDRESS) // driver advertised host
    val port = conf.get("spark.driver.port").toInt // 0 at this point
    val ioEncryptionKey = if (conf.get(IO_ENCRYPTION_ENABLED)) {
      Some(CryptoStreamUtils.createKey(conf))
    } else {
      None
    }
    create(
      conf,
      SparkContext.DRIVER_IDENTIFIER, // the driver's identifier, "driver"
      bindAddress, // driver bind host
      advertiseAddress,
      Option(port), // Option(0)
      isLocal,
      numCores, // number of cores configured for the driver
      ioEncryptionKey, // None unless I/O encryption is enabled
      listenerBus = listenerBus,
      mockOutputCommitCoordinator = mockOutputCommitCoordinator // None by default
    )
  }
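
The port-0 behavior described in the comment can be observed directly. A minimal, illustrative local-mode snippet (the object and app names here are made up; only the config key is Spark's):

import org.apache.spark.{SparkConf, SparkContext, SparkEnv}

// Illustrative only: spark.driver.port is requested as 0, and after create() has run the
// config holds whatever ephemeral port the RpcEnv actually bound to.
object DriverPortDemo {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("driver-port-demo").setMaster("local[1]"))
    println("bound driver port: " + SparkEnv.get.conf.get("spark.driver.port"))
    sc.stop()
  }
}
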
// Create the executor-side SparkEnv
private[spark] def createExecutorEnv(
      conf: SparkConf,
      executorId: String,
      hostname: String,
      numCores: Int,
      ioEncryptionKey: Option[Array[Byte]], // None unless I/O encryption is enabled
      isLocal: Boolean // false in yarn-cluster mode
      ): SparkEnv = {
    val env = create(
      conf,
      executorId,
      hostname, // bind address
      hostname, // advertise address
      None,     // no fixed port
      isLocal,
      numCores,
      ioEncryptionKey
    )
    SparkEnv.set(env)
    env
  }
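
Note that the method ends with SparkEnv.set(env), so any code that later runs inside a task can reach the executor's environment through SparkEnv.get. A small runnable illustration (local mode, so the "executor" is the driver JVM and the ID reported is "driver"; on a real cluster you would see executor IDs):

import org.apache.spark.{SparkConf, SparkContext, SparkEnv}

// Illustrative only: each task looks up the SparkEnv that was registered on its side.
val sc = new SparkContext(new SparkConf().setAppName("env-demo").setMaster("local[2]"))
val ids = sc.parallelize(1 to 4, 2).map(_ => SparkEnv.get.executorId).distinct().collect()
println(ids.mkString(", ")) // "driver" in local mode; real executor IDs on a cluster
sc.stop()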

The create method, which is the most important one here:

private def create(
      conf: SparkConf,
      executorId: String, // "driver" on the driver, otherwise the executor's ID
      bindAddress: String, // bind host
      advertiseAddress: String, // advertised host
      port: Option[Int], // Option(0) on the driver, None on executors
      isLocal: Boolean, // false in yarn-cluster mode
      numUsableCores: Int, // number of usable CPU cores
      ioEncryptionKey: Option[Array[Byte]], // None unless I/O encryption is enabled
      listenerBus: LiveListenerBus = null, // non-null only on the driver
      mockOutputCommitCoordinator: Option[OutputCommitCoordinator] = None): SparkEnv = { // None by default

    val isDriver = executorId == SparkContext.DRIVER_IDENTIFIER // true on the driver

    // Listener bus is only used on the driver
    if (isDriver) {
      assert(listenerBus != null, "Attempted to create driver SparkEnv with null listener bus!")
    }

    val securityManager = new SecurityManager(conf, ioEncryptionKey)
    if (isDriver) {
      securityManager.initializeAuth()
    }

    ioEncryptionKey.foreach { _ =>
      if (!securityManager.isEncryptionEnabled()) {
        logWarning("I/O encryption enabled without RPC encryption: keys will be visible on the " +
          "wire.")
      }
    }
    // sparkDriver or sparkExecutor
    val systemName = if (isDriver) driverSystemName else executorSystemName
    // Create the RPC environment. This deserves a separate post; once this call returns,
    // the RpcEnv is bound to a real host and port.
    val rpcEnv = RpcEnv.create(systemName, bindAddress, advertiseAddress, port.getOrElse(-1), conf,
      securityManager, numUsableCores, !isDriver)

    // Figure out which port RpcEnv actually bound to in case the original port is 0 or occupied.
    if (isDriver) {
      // record the port that was actually bound
      conf.set("spark.driver.port", rpcEnv.address.port.toString)
    }
    // Instantiate a class from its name via reflection
    def instantiateClass[T](className: String): T = {
      val cls = Utils.classForName(className)
      // Look for a constructor taking a SparkConf and a boolean isDriver, then one taking just
      // SparkConf, then one taking no arguments
      try {
        cls.getConstructor(classOf[SparkConf], java.lang.Boolean.TYPE)
          .newInstance(conf, new java.lang.Boolean(isDriver))
          .asInstanceOf[T]
      } catch {
        case _: NoSuchMethodException =>
          try {
            cls.getConstructor(classOf[SparkConf]).newInstance(conf).asInstanceOf[T]
          } catch {
            case _: NoSuchMethodException =>
              cls.getConstructor().newInstance().asInstanceOf[T]
          }
      }
    }
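
    // [Illustration, not part of create()] For a hypothetical class
    //   class MySerializer(conf: SparkConf, isDriver: Boolean) extends Serializer
    // the (SparkConf, Boolean) constructor is chosen first; classes exposing only a
    // (SparkConf) constructor or a no-arg constructor are handled by the fallbacks above.
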
    // Read the class name from the config, falling back to the default
    def instantiateClassFromConf[T](propertyName: String, defaultClassName: String): T = {
      instantiateClass[T](conf.get(propertyName, defaultClassName))
    }
    // Instantiate the serializer. The default is org.apache.spark.serializer.JavaSerializer,
    // which can be overridden through the spark.serializer property on SparkConf.
    val serializer = instantiateClassFromConf[Serializer](
      "spark.serializer", "org.apache.spark.serializer.JavaSerializer")
    logDebug(s"Using serializer: ${serializer.getClass}")
    // Create the serializer manager
    val serializerManager = new SerializerManager(serializer, conf, ioEncryptionKey)
    // The closure serializer always uses Java serialization
    val closureSerializer = new JavaSerializer(conf)
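
    // [Note] This is independent of spark.serializer: task closures are always serialized
    // with plain Java serialization, so Kryo settings do not apply to them.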

    def registerOrLookupEndpoint(
        name: String, endpointCreator: => RpcEndpoint):
      RpcEndpointRef = {
      if (isDriver) {
        logInfo("Registering " + name)
        rpcEnv.setupEndpoint(name, endpointCreator)
      } else {
        RpcUtils.makeDriverRef(name, conf, rpcEnv)
      }
    }
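
    // [Note] registerOrLookupEndpoint is what lets driver and executor share the wiring code
    // below: on the driver the endpoint is created and registered in the local RpcEnv, while
    // on an executor only a reference to the driver-side endpoint is built (an address of
    // roughly the form spark://<endpoint-name>@<driver-host>:<driver-port>).
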
    // Create the broadcast manager
    val broadcastManager = new BroadcastManager(isDriver, conf, securityManager)
    // There are two flavors of MapOutputTracker: MapOutputTrackerMaster on the driver and
    // MapOutputTrackerWorker on executors
    val mapOutputTracker = if (isDriver) {
      new MapOutputTrackerMaster(conf, broadcastManager, isLocal)
    } else {
      new MapOutputTrackerWorker(conf)
    }

    // Have to assign trackerEndpoint after initialization as MapOutputTrackerEndpoint
    // requires the MapOutputTracker itself
    mapOutputTracker.trackerEndpoint = registerOrLookupEndpoint(MapOutputTracker.ENDPOINT_NAME,
      new MapOutputTrackerMasterEndpoint(
        rpcEnv, mapOutputTracker.asInstanceOf[MapOutputTrackerMaster], conf))

    // Let the user specify short names for shuffle managers
    val shortShuffleMgrNames = Map(
      "sort" -> classOf[org.apache.spark.shuffle.sort.SortShuffleManager].getName,
      "tungsten-sort" -> classOf[org.apache.spark.shuffle.sort.SortShuffleManager].getName)
    val shuffleMgrName = conf.get("spark.shuffle.manager", "sort")
    val shuffleMgrClass =
      shortShuffleMgrNames.getOrElse(shuffleMgrName.toLowerCase(Locale.ROOT), shuffleMgrName)
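
    // [Illustration, not part of create()] Users normally configure this with the short name,
    // e.g. conf.set("spark.shuffle.manager", "sort"), or with the fully qualified class name
    // of a custom ShuffleManager implementation.
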
    // Create the ShuffleManager
    val shuffleManager = instantiateClass[ShuffleManager](shuffleMgrClass)
    // Whether to use the legacy (static) memory management model
    val useLegacyMemoryManager = conf.getBoolean("spark.memory.useLegacyMode", false)
    val memoryManager: MemoryManager =
      if (useLegacyMemoryManager) {
        new StaticMemoryManager(conf, numUsableCores)
      } else {
        UnifiedMemoryManager(conf, numUsableCores)
      }

    val blockManagerPort = if (isDriver) {
      conf.get(DRIVER_BLOCK_MANAGER_PORT)
    } else {
      conf.get(BLOCK_MANAGER_PORT)
    }

    val blockTransferService =
      new NettyBlockTransferService(conf, securityManager, bindAddress, advertiseAddress,
        blockManagerPort, numUsableCores)

    val blockManagerMaster = new BlockManagerMaster(registerOrLookupEndpoint(
      BlockManagerMaster.DRIVER_ENDPOINT_NAME,
      new BlockManagerMasterEndpoint(rpcEnv, isLocal, conf, listenerBus)),
      conf, isDriver)

    // NB: blockManager is not valid until initialize() is called later.
    val blockManager = new BlockManager(executorId, rpcEnv, blockManagerMaster,
      serializerManager, conf, memoryManager, mapOutputTracker, shuffleManager,
      blockTransferService, securityManager, numUsableCores)
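
    // [Note] initialize() is invoked later, once an application ID is known: from
    // SparkContext on the driver, and when the Executor instance is constructed on executors.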

    val metricsSystem = if (isDriver) {
      // Don't start metrics system right now for Driver.
      // We need to wait for the task scheduler to give us an app ID.
      // Then we can start the metrics system.
      MetricsSystem.createMetricsSystem("driver", conf, securityManager)
    } else {
      // We need to set the executor ID before the MetricsSystem is created because sources and
      // sinks specified in the metrics configuration file will want to incorporate this executor's
      // ID into the metrics they report.
      conf.set("spark.executor.id", executorId)
      val ms = MetricsSystem.createMetricsSystem("executor", conf, securityManager)
      ms.start()
      ms
    }

    val outputCommitCoordinator = mockOutputCommitCoordinator.getOrElse {
      new OutputCommitCoordinator(conf, isDriver)
    }
    val outputCommitCoordinatorRef = registerOrLookupEndpoint("OutputCommitCoordinator",
      new OutputCommitCoordinatorEndpoint(rpcEnv, outputCommitCoordinator))
    outputCommitCoordinator.coordinatorRef = Some(outputCommitCoordinatorRef)

    val envInstance = new SparkEnv(
      executorId,
      rpcEnv,
      serializer,
      closureSerializer,
      serializerManager,
      mapOutputTracker,
      shuffleManager,
      broadcastManager,
      blockManager,
      securityManager,
      metricsSystem,
      memoryManager,
      outputCommitCoordinator,
      conf)

    // Add a reference to tmp dir created by driver, we will delete this tmp dir when stop() is
    // called, and we only need to do it for driver. Because driver may run as a service, and if we
    // don't delete this tmp dir when sc is stopped, then will create too many tmp dirs.
    if (isDriver) {
      val sparkFilesDir = Utils.createTempDir(Utils.getLocalDir(conf), "userFiles").getAbsolutePath
      envInstance.driverTmpDir = Some(sparkFilesDir)
    }

    envInstance
  }
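
Most of what create() wires up is steered by a handful of SparkConf keys. A small illustrative snippet pulling them together (the values are chosen for the example, not recommendations; the keys are the ones read above):

import org.apache.spark.SparkConf

// Illustrative only: explicit settings for the knobs create() consults.
val conf = new SparkConf()
  .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer") // instead of JavaSerializer
  .set("spark.shuffle.manager", "sort")           // short name resolved to SortShuffleManager
  .set("spark.memory.useLegacyMode", "false")     // UnifiedMemoryManager (the default)
  .set("spark.driver.blockManager.port", "45001") // fixed driver-side block manager port (example value)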

SparkEnv Class

The class itself is straightforward: it simply holds references to
serializer,
closureSerializer,
serializerManager,
mapOutputTracker,
shuffleManager,
broadcastManager,
blockManager,
securityManager,
metricsSystem,
memoryManager,
outputCommitCoordinator
so that when SparkEnv is stopped these components can be shut down in one well-defined order; it also records driverTmpDir.
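
A hedged, condensed sketch of that shutdown sequence (paraphrasing SparkEnv.stop() in Spark 2.x; the exact order and error handling differ slightly between versions):

// Condensed paraphrase of SparkEnv.stop(), not a verbatim copy of the source.
private[spark] def stop(): Unit = {
  mapOutputTracker.stop()
  shuffleManager.stop()
  broadcastManager.stop()
  blockManager.stop()
  blockManager.master.stop()
  metricsSystem.stop()
  outputCommitCoordinator.stop()
  rpcEnv.shutdown()
  rpcEnv.awaitTermination()
  // on the driver, the "userFiles" temp dir recorded in driverTmpDir is deleted here
  driverTmpDir.foreach(dir => Utils.deleteRecursively(new java.io.File(dir)))
}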
