8. Worker Internals

I. The Worker launches the Driver

1 The master sends the worker a launch-driver message, worker.endpoint.send(LaunchDriver(driver.id, driver.desc)); see Section 7.

2 On receiving the message, the worker enters the LaunchDriver handler.

3 It creates a DriverRunner instance and calls its start method.

4 Inside DriverRunner.start, a thread is created that does the following:

   4.1 Create the driver's working directory.

   4.2 Download the user jar into the driver's working directory.

   4.3 Build the driver launch command and start the driver with a ProcessBuilder.

   4.4 Send a driver-state-changed message to the current worker.

5 Update the worker's CPU and memory usage.

    // 2 On receiving the message, the worker handles LaunchDriver
    case LaunchDriver(driverId, driverDesc) =>
      logInfo(s"Asked to launch driver $driverId")
      // 3 Create a DriverRunner instance and call its start method
      val driver = new DriverRunner(
        conf,
        driverId,
        workDir,
        sparkHome,
        driverDesc.copy(command = Worker.maybeUpdateSSLSettings(driverDesc.command, conf)),
        self,
        workerUri,
        securityMgr)
      drivers(driverId) = driver
      driver.start()

      // 5 Update the worker's CPU and memory usage
      coresUsed += driverDesc.cores
      memoryUsed += driverDesc.mem


  /** Starts a thread to run and manage the driver. */
  private[worker] def start() = {
    // 4 In DriverRunner.start, create a thread
    new Thread("DriverRunner for " + driverId) {
      override def run() {
        var shutdownHook: AnyRef = null
        try {
          shutdownHook = ShutdownHookManager.addShutdownHook { () =>
            logInfo(s"Worker shutting down, killing driver $driverId")
            kill()
          }

          // prepare driver jars and run driver
          val exitCode = prepareAndRunDriver()

          // set final state depending on if forcibly killed and process exit code
          finalState = if (exitCode == 0) {
            Some(DriverState.FINISHED)
          } else if (killed) {
            Some(DriverState.KILLED)
          } else {
            Some(DriverState.FAILED)
          }
        } catch {
          case e: Exception =>
            kill()
            finalState = Some(DriverState.ERROR)
            finalException = Some(e)
        } finally {
          if (shutdownHook != null) {
            ShutdownHookManager.removeShutdownHook(shutdownHook)
          }
        }

        // notify worker of final driver state, possible exception
        worker.send(DriverStateChanged(driverId, finalState.get, finalException)) // 4.4 notify the current worker of the driver state change
      }
    }.start()
  }

  private[worker] def prepareAndRunDriver(): Int = {
    val driverDir = createWorkingDirectory() // 4.1 create the driver working directory
    val localJarFilename = downloadUserJar(driverDir) // 4.2 download the user jar into it

    def substituteVariables(argument: String): String = argument match {
      case "{{WORKER_URL}}" => workerUrl
      case "{{USER_JAR}}" => localJarFilename
      case other => other
    }

    // TODO: If we add ability to submit multiple jars they should also be added here
    val builder = CommandUtils.buildProcessBuilder(driverDesc.command, securityManager,
      driverDesc.mem, sparkHome.getAbsolutePath, substituteVariables) // 4.3 build the driver launch command

    runDriver(builder, driverDir, driverDesc.supervise) // 4.3 start the driver via ProcessBuilder
  }


  private def runDriver(builder: ProcessBuilder, baseDir: File, supervise: Boolean): Int = {
    builder.directory(baseDir)
    def initialize(process: Process): Unit = {
      // Redirect stdout and stderr to files
      val stdout = new File(baseDir, "stdout")
      CommandUtils.redirectStream(process.getInputStream, stdout)

      val stderr = new File(baseDir, "stderr")
      val formattedCommand = builder.command.asScala.mkString("\"", "\" \"", "\"")
      val header = "Launch Command: %s\n%s\n\n".format(formattedCommand, "=" * 40)
      Files.append(header, stderr, StandardCharsets.UTF_8)
      CommandUtils.redirectStream(process.getErrorStream, stderr)
    }
    runCommandWithRetry(ProcessBuilderLike(builder), initialize, supervise) // 4.3 start the driver via ProcessBuilder, retrying if supervised
  }
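
The supervise flag passed into runDriver corresponds to spark-submit's --supervise option: when it is set, the worker restarts the driver process whenever it exits abnormally. For reference, the retry loop in DriverRunner.runCommandWithRetry looks roughly like this (condensed from the Spark 2.x source; clock and sleeper are small DriverRunner helper members):

  private[worker] def runCommandWithRetry(
      command: ProcessBuilderLike, initialize: Process => Unit, supervise: Boolean): Int = {
    var exitCode = -1
    var waitSeconds = 1                // back-off between restarts
    val successfulRunDuration = 5      // a run of >= 5s resets the back-off
    var keepTrying = !killed

    while (keepTrying) {
      logInfo("Launch Command: " + command.command.mkString("\"", "\" \"", "\""))
      synchronized {
        if (killed) { return exitCode }
        process = Some(command.start())
        initialize(process.get)
      }
      val processStart = clock.getTimeMillis()
      exitCode = process.get.waitFor()

      // Retry only when supervising, the exit was abnormal, and the driver wasn't killed
      keepTrying = supervise && exitCode != 0 && !killed
      if (keepTrying) {
        if (clock.getTimeMillis() - processStart > successfulRunDuration * 1000) {
          waitSeconds = 1
        }
        logInfo(s"Command exited with status $exitCode, re-launching after $waitSeconds s.")
        sleeper.sleep(waitSeconds)
        waitSeconds = waitSeconds * 2  // exponential back-off
      }
    }
    exitCode
  }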

II. The Worker launches an Executor

1 The master sends the worker a launch-executor message, worker.endpoint.send(LaunchExecutor(masterUrl, exec.application.id, exec.id, exec.application.desc, exec.cores, exec.memory)); see Section 7.

2 On receiving the message, the worker enters the LaunchExecutor handler.

3 It creates the executor's working directory.

4 It creates an ExecutorRunner instance and calls its start method.

5 Inside ExecutorRunner.start, a thread is created that does the following:

   5.1 Build the executor launch command and start the executor with a ProcessBuilder.

   5.2 Send an executor-state-changed message to the current worker.

6 The worker sends an ExecutorStateChanged message to the master.

    // 2 On receiving the message, the worker enters the LaunchExecutor handler
    case LaunchExecutor(masterUrl, appId, execId, appDesc, cores_, memory_) =>
      if (masterUrl != activeMasterUrl) {
        logWarning("Invalid Master (" + masterUrl + ") attempted to launch executor.")
      } else {
        try {
          logInfo("Asked to launch executor %s/%d for %s".format(appId, execId, appDesc.name))

          // 3 Create the executor's working directory
          val executorDir = new File(workDir, appId + "/" + execId)
          if (!executorDir.mkdirs()) {
            throw new IOException("Failed to create directory " + executorDir)
          }

          // Create local dirs for the executor. These are passed to the executor via the
          // SPARK_EXECUTOR_DIRS environment variable, and deleted by the Worker when the
          // application finishes.
          val appLocalDirs = appDirectories.getOrElse(appId, {
            val localRootDirs = Utils.getOrCreateLocalRootDirs(conf)
            val dirs = localRootDirs.flatMap { dir =>
              try {
                val appDir = Utils.createDirectory(dir, namePrefix = "executor")
                Utils.chmod700(appDir)
                Some(appDir.getAbsolutePath())
              } catch {
                case e: IOException =>
                  logWarning(s"${e.getMessage}. Ignoring this directory.")
                  None
              }
            }.toSeq
            if (dirs.isEmpty) {
              throw new IOException("No subfolder can be created in " +
                s"${localRootDirs.mkString(",")}.")
            }
            dirs
          })
          appDirectories(appId) = appLocalDirs
          // 4 Create an ExecutorRunner instance and call its start method
          val manager = new ExecutorRunner(
            appId,
            execId,
            appDesc.copy(command = Worker.maybeUpdateSSLSettings(appDesc.command, conf)),
            cores_,
            memory_,
            self,
            workerId,
            host,
            webUi.boundPort,
            publicAddress,
            sparkHome,
            executorDir,
            workerUri,
            conf,
            appLocalDirs, ExecutorState.RUNNING)
          executors(appId + "/" + execId) = manager
          manager.start()
          coresUsed += cores_
          memoryUsed += memory_
          sendToMaster(ExecutorStateChanged(appId, execId, manager.state, None, None)) // 6 send ExecutorStateChanged to the master
        } catch {
          case e: Exception =>
            logError(s"Failed to launch executor $appId/$execId for ${appDesc.name}.", e)
            if (executors.contains(appId + "/" + execId)) {
              executors(appId + "/" + execId).kill()
              executors -= appId + "/" + execId
            }
            sendToMaster(ExecutorStateChanged(appId, execId, ExecutorState.FAILED,
              Some(e.toString), None))
        }
      }


  private[worker] def start() {
    // 5 In ExecutorRunner.start, create a thread
    workerThread = new Thread("ExecutorRunner for " + fullId) {
      // 5.1 Build the executor launch command and start the executor via ProcessBuilder
      override def run() { fetchAndRunExecutor() }
    }
    workerThread.start()
    // Shutdown hook that kills actors on shutdown.
    shutdownHook = ShutdownHookManager.addShutdownHook { () =>
      // It's possible that we arrive here before calling `fetchAndRunExecutor`, then `state` will
      // be `ExecutorState.RUNNING`. In this case, we should set `state` to `FAILED`.
      if (state == ExecutorState.RUNNING) {
        state = ExecutorState.FAILED
      }
      killProcess(Some("Worker shutting down")) }
  }

  /**
   * Download and run the executor described in our ApplicationDescription
   */
  private def fetchAndRunExecutor() {
    try {
      // Launch the process
      val builder = CommandUtils.buildProcessBuilder(appDesc.command, new SecurityManager(conf),
        memory, sparkHome.getAbsolutePath, substituteVariables)
      val command = builder.command()
      val formattedCommand = command.asScala.mkString("\"", "\" \"", "\"")
      logInfo(s"Launch command: $formattedCommand")

      builder.directory(executorDir)
      builder.environment.put("SPARK_EXECUTOR_DIRS", appLocalDirs.mkString(File.pathSeparator))
      // In case we are running this from within the Spark Shell, avoid creating a "scala"
      // parent process for the executor command
      builder.environment.put("SPARK_LAUNCH_WITH_SCALA", "0")

      // Add webUI log urls
      val baseUrl =
        if (conf.getBoolean("spark.ui.reverseProxy", false)) {
          s"/proxy/$workerId/logPage/?appId=$appId&executorId=$execId&logType="
        } else {
          s"http://$publicAddress:$webUiPort/logPage/?appId=$appId&executorId=$execId&logType="
        }
      builder.environment.put("SPARK_LOG_URL_STDERR", s"${baseUrl}stderr")
      builder.environment.put("SPARK_LOG_URL_STDOUT", s"${baseUrl}stdout")

      process = builder.start()
      val header = "Spark Executor Command: %s\n%s\n\n".format(
        formattedCommand, "=" * 40)

      // Redirect its stdout and stderr to files
      val stdout = new File(executorDir, "stdout")
      stdoutAppender = FileAppender(process.getInputStream, stdout, conf)

      val stderr = new File(executorDir, "stderr")
      Files.write(header, stderr, StandardCharsets.UTF_8)
      stderrAppender = FileAppender(process.getErrorStream, stderr, conf)

      // Wait for it to exit; executor may exit with code 0 (when driver instructs it to shutdown)
      // or with nonzero exit code
      val exitCode = process.waitFor()
      state = ExecutorState.EXITED
      val message = "Command exited with code " + exitCode
      worker.send(ExecutorStateChanged(appId, execId, state, Some(message), Some(exitCode))) // 5.2 notify the current worker of the executor state change
    } catch {
      case interrupted: InterruptedException =>
        logInfo("Runner thread for executor " + fullId + " interrupted")
        state = ExecutorState.KILLED
        killProcess(None)
      case e: Exception =>
        logError("Error running executor", e)
        state = ExecutorState.FAILED
        killProcess(Some(e.toString))
    }
  }

 

The following draws on:

https://www.cnblogs.com/jcchoiling/p/6436185.html

https://blog.csdn.net/chic_data/article/details/77317730

The process launched for an executor is called CoarseGrainedExecutorBackend; its launch command is carried in the app desc (ApplicationDescription).

  • Step 1: the Master instructs the Worker to start an Executor.
  • Step 2: the Worker receives the instruction and uses an ExecutorRunner to launch the Executor in a separate process.
  • Step 3: that new process registers the Executor with the Driver by sending RegisterExecutor. Question: why start a new process rather than send the registration from the Worker process itself? Because the Worker's job is to manage the resources of its machine and report resource changes to the Master; it is not meant to do computation. Moreover, one machine may host executors for many different applications, and if each executor did not get its own process, one crashing application could take down the others. A sketch of the executor launch command follows this list.
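
For reference, the command that launches this new process is assembled on the driver side and shipped to the worker inside the ApplicationDescription. A simplified sketch of how StandaloneSchedulerBackend builds it in Spark 2.x (the {{...}} placeholders are later substituted by the worker, just as ExecutorRunner's substituteVariables does for the driver above):

  // The worker's ExecutorRunner turns this Command into a ProcessBuilder,
  // so CoarseGrainedExecutorBackend becomes the main class of the new JVM.
  val args = Seq(
    "--driver-url", driverUrl,
    "--executor-id", "{{EXECUTOR_ID}}",
    "--hostname", "{{HOSTNAME}}",
    "--cores", "{{CORES}}",
    "--app-id", "{{APP_ID}}",
    "--worker-url", "{{WORKER_URL}}")
  val command = Command("org.apache.spark.executor.CoarseGrainedExecutorBackend",
    args, sc.executorEnvs, classPathEntries, libraryPathEntries, javaOpts)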

     

1. When CoarseGrainedExecutorBackend starts, what it registers with the Driver is in essence the ExecutorBackend instance itself; the registration has no direct relationship with an Executor instance (none exists yet at this point).

  override def onStart() {
    logInfo("Connecting to driver: " + driverUrl)
    rpcEnv.asyncSetupEndpointRefByURI(driverUrl).flatMap { ref =>
      // This is a very fast action so we can use "ThreadUtils.sameThread"
      // obtain a reference to the driver
      driver = Some(ref)
      // send the RegisterExecutor message to the driver
      ref.ask[Boolean](RegisterExecutor(executorId, self, hostname, cores, extractLogUrls))
    }(ThreadUtils.sameThread).onComplete {
      // This is a very fast action so we can use "ThreadUtils.sameThread"
      case Success(msg) =>
        // Always receive `true`. Just ignore it
      case Failure(e) =>
        exitExecutor(1, s"Cannot register with driver: $driverUrl", e, notifyDriver = false)
    }(ThreadUtils.sameThread)
  }

2. CoarseGrainedExecutorBackend is the name of the process the Executor runs in; the Executor is the object that actually processes tasks, and internally it performs task computation on a thread pool (see the sketch after the snippet below).

  // In the Executor class
  def launchTask(context: ExecutorBackend, taskDescription: TaskDescription): Unit = {
    val tr = new TaskRunner(context, taskDescription) // each task gets its own TaskRunner
    runningTasks.put(taskDescription.taskId, tr) // track it in the running-task map
    threadPool.execute(tr) // hand it to the thread pool for execution
  }
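
The threadPool used above is an unbounded cached pool of daemon threads, so every concurrently running task gets its own thread. A simplified sketch of its construction (Spark's actual code differs in detail, e.g. it also installs an uncaught-exception handler on each thread):

  import java.util.concurrent.{Executors, ThreadPoolExecutor}
  import com.google.common.util.concurrent.ThreadFactoryBuilder

  private val threadPool: ThreadPoolExecutor = {
    val threadFactory = new ThreadFactoryBuilder()
      .setDaemon(true)                                 // daemon threads don't block JVM exit
      .setNameFormat("Executor task launch worker-%d") // names visible in thread dumps
      .build()
    Executors.newCachedThreadPool(threadFactory).asInstanceOf[ThreadPoolExecutor]
  }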

3. CoarseGrainedExecutorBackend and Executor are in one-to-one correspondence.

4. CoarseGrainedExecutorBackend is a message endpoint (it implements ThreadSafeRpcEndpoint): it can send messages to the Driver and receive commands from the Driver, such as launching a task.

  override def receive: PartialFunction[Any, Unit] = {
    // If registration with the driver succeeds, the driver replies with RegisteredExecutor;
    // on receiving it, create the Executor object that will actually run tasks
    case RegisteredExecutor =>
      logInfo("Successfully registered with driver")
      try {
        executor = new Executor(executorId, hostname, env, userClassPath, isLocal = false)
      } catch {
        case NonFatal(e) =>
          exitExecutor(1, "Unable to create executor due to " + e.getMessage, e)
      }

    ...

    // launch a task
    case LaunchTask(data) =>
      if (executor == null) {
        exitExecutor(1, "Received LaunchTask command but executor was null")
      } else {
        val taskDesc = TaskDescription.decode(data.value)
        logInfo("Got assigned task " + taskDesc.taskId)
        executor.launchTask(this, taskDesc) // delegate to Executor.launchTask, which ultimately runs on the thread pool
      }

   ...
  }

5. The Driver process contains two crucial endpoints:

    a) ClientEndpoint, which registers the application with the Master; it is an inner member of AppClient.

    b) DriverEndpoint, the driving endpoint of the whole running application; it is an inner member of CoarseGrainedSchedulerBackend.

6. In the Driver, the ExecutorBackend's information is wrapped in an ExecutorData object (see the sketch after this list) and registered into the Driver's in-memory data structure executorDataMap. executorDataMap is a member of CoarseGrainedSchedulerBackend, so the registration ultimately lands in CoarseGrainedSchedulerBackend.

7. At execution time, DriverEndpoint writes this information into CoarseGrainedSchedulerBackend's in-memory structure executorDataMap. In other words, CoarseGrainedSchedulerBackend knows every ExecutorBackend process allocated to the current application, and within each ExecutorBackend process an Executor object is responsible for actually running tasks. Writes to executorDataMap are guarded with the synchronized keyword to keep concurrent updates safe.

8. After CoarseGrainedExecutorBackend receives the RegisteredExecutor message sent back by DriverEndpoint, it creates the Executor instance, and that Executor instance is what in fact performs the task computation.
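
For reference, ExecutorData is essentially a plain holder for everything the scheduler needs to know about one ExecutorBackend; in Spark 2.x it looks roughly like this (the exact field set varies across versions):

  private[cluster] class ExecutorData(
      val executorEndpoint: RpcEndpointRef, // RPC handle used to send LaunchTask etc.
      val executorAddress: RpcAddress,
      override val executorHost: String,
      var freeCores: Int,                   // decremented as tasks are scheduled onto it
      override val totalCores: Int,
      override val logUrlMap: Map[String, String]
  ) extends ExecutorInfo(executorHost, totalCores, logUrlMap)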
 

How does the Executor work?

1. When the Driver sends over a task, the message is actually delivered to the CoarseGrainedExecutorBackend RpcEndpoint, not directly to the Executor (the Executor is not a message loop, so it can never directly receive remotely sent messages). The snippet below is from an older Spark version than the receive method quoted earlier, hence ser.deserialize rather than TaskDescription.decode.

case LaunchTask(data) =>
  if (executor == null) {
    logError("Received LaunchTask command but executor was null")
    System.exit(1)
  } else {
    val taskDesc = ser.deserialize[TaskDescription](data.value)
    logInfo("Got assigned task " + taskDesc.taskId)
    executor.launchTask(this, taskId = taskDesc.taskId, attemptNumber = taskDesc.attemptNumber,
      taskDesc.name, taskDesc.serializedTask)
  }

 

2. After the ExecutorBackend receives the message from the Driver, it hands the task to the Executor for execution by calling launchTask, as the same snippet above shows.


Before walking through CoarseGrainedExecutorBackend's startup flow, a brief orientation. The Executor is responsible for computation, i.e. running tasks, while the creation and maintenance of the Executor object is handled by CoarseGrainedExecutorBackend. At Spark runtime, CoarseGrainedExecutorBackend is a standalone process, visible on a Worker node via the JDK's jps command.
 
CoarseGrainedExecutorBackend is a subclass of RpcEndpoint and can communicate with the Driver over RPC; its lifecycle method onStart is worth studying for the actions it performs.
CoarseGrainedExecutorBackend maintains two fields, executor and driver: executor runs tasks, while driver handles communication with the Driver.
ExecutorBackend declares the abstract method statusUpdate, which reports the Executor's computation results back to the Driver; a sketch follows below.
Finally, CoarseGrainedExecutorBackend is a process at Spark runtime, and the Executor runs inside that process.
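
The statusUpdate implementation in CoarseGrainedExecutorBackend is short; as in the Spark 2.x source, it simply forwards a StatusUpdate message to DriverEndpoint (the executor's TaskRunner calls it to report task state and serialized results):

  override def statusUpdate(taskId: Long, state: TaskState, data: ByteBuffer) {
    val msg = StatusUpdate(executorId, taskId, state, data)
    driver match {
      case Some(driverRef) => driverRef.send(msg)
      case None => logWarning(s"Drop $msg because has not yet connected to driver")
    }
  }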
 

 

 
