【Spark】 Spark作业执行原理--获取执行结果

最新推荐文章于 2023-03-29 10:38:31 发布

勤言不勤语

最新推荐文章于 2023-03-29 10:38:31 发布

阅读量1.2k

点赞数

分类专栏： Spark 文章标签： spark

本文链接：https://blog.csdn.net/w1992wishes/article/details/86075583

版权

本文详细介绍了Spark作业执行完成后结果的处理流程，包括执行结果的序列化、发送及如何根据TaskState获取结果。当任务完成时，依据结果大小采用不同策略返回，如直接发送、存入BlockManager或丢弃。对于不同TaskState如FINISHED、FAILED、KILLED、LOST，Spark有不同的响应机制，包括重试和资源清理。

摘要由CSDN通过智能技术生成

一、执行结果并序列化

任务执行完成后，会在 TaskRunner 的 run 方法的后半部分将执行结果返回给 Driver：

override def run(): Unit = {
    ...
    // 执行任务
    val value = try {
      val res = task.run(
        taskAttemptId = taskId,
        attemptNumber = attemptNumber,
        metricsSystem = env.metricsSystem)
      threwException = false
      res
    } 
    ...
    val taskFinish = System.currentTimeMillis()
    val taskFinishCpu = if (threadMXBean.isCurrentThreadCpuTimeSupported) {
      threadMXBean.getCurrentThreadCpuTime
    } else 0L

    // If the task has been killed, let's fail it.
    if (task.killed) {
      throw new TaskKilledException
    }
    
	// 序列化结果
    val resultSer = env.serializer.newInstance()
    val beforeSerialization = System.currentTimeMillis()
    val valueBytes = resultSer.serialize(value)
    val afterSerialization = System.currentTimeMillis()

    // Deserialization happens in two parts: first, we deserialize a Task object, which
    // includes the Partition. Second, Task.run() deserializes the RDD and function to be run.
    task.metrics.setExecutorDeserializeTime(
      (taskStart - deserializeStartTime) + task.executorDeserializeTime)
    task.metrics.setExecutorDeserializeCpuTime(
      (taskStartCpu - deserializeStartCpuTime) + task.executorDeserializeCpuTime)
    // We need to subtract Task.run()'s deserialization time to avoid double-counting
    task.metrics.setExecutorRunTime((taskFinish - taskStart) - task.executorDeserializeTime)
    task.metrics.setExecutorCpuTime(
      (taskFinishCpu - taskStartCpu) - task.executorDeserializeCpuTime)
    task.metrics.setJvmGCTime(computeTotalGcTime() - startGCTime)
    task.metrics.setResultSerializationTime(afterSerialization - beforeSerialization)

    // 序列化后的结果封装成 DirectTaskResult
    // Note: accumulator updates must be collected after TaskMetrics is updated
    val accumUpdates = task.collectAccumulatorUpdates()
    // TODO: do not serialize value twice
    val directResult = new DirectTaskResult(valueBytes, accumUpdates)
    val serializedDirectResult = ser.serialize(directResult)
    val resultSize = serializedDirectResult.limit

    // directSend = sending directly back to the driver
    val serializedResult: ByteBuffer = {
      // 生成结果大于最大值（默认1GB）直接丢弃
      if (maxResultSize > 0 && resultSize > maxResultSize) {
        logWarning(s"Finished $taskName (TID $taskId). Result is larger than maxResultSize " +
          s"(${Utils.bytesToString(resultSize)} > ${Utils.bytesToString(maxResultSize)}), " +
          s"dropping it.")
        ser.serialize(new IndirectTaskResult[Any](TaskResultBlockId(taskId), resultSize))
      // 生成结果大于设置的 maxDirectResultSize 且未超过最大值，则存放到 BlockManager 中，然后返回该数据块的编号（BlockId）
      } else if (resultSize > maxDirectResultSize) {
        val blockId = TaskResultBlockId(taskId)
        env.blockManager.putBytes(
          blockId,
          new ChunkedByteBuffer(serializedDirectResult.duplicate()),
          StorageLevel.MEMORY_AND_DISK_SER)
        logInfo(
          s"Finished $taskName (TID $taskId). $resultSize bytes result sent via BlockManager)")
        ser.serialize(new IndirectTaskResult[Any](blockId, resultSize))
      // 其他结果直接返回
      } else {
        logInfo(s"Finished $taskName (TID $taskId). $re