SQL Tab
For queries submitted from spark-shell, what is shown is always console.
For queries submitted from spark-sql, what is shown is always sql.
Importing and exporting event logs
If you import event logs from another machine, the file name needs to match the spark.app.id inside the log for the mapping to succeed; otherwise the history server goes by the spark.app.id recorded in the log contents, e.g.:
"spark.app.id":"156_Vanilla_Eventlog"
Also be aware that the history server deduplicates event logs with identical contents: if you scp another copy of an identical event log into the same directory, it will not show up in the UI.
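As a quick sanity check when importing, you can verify that a log file's name matches the spark.app.id embedded in it. A minimal sketch (the path below is hypothetical):

import scala.io.Source

val path = "/history/eventlogs/156_Vanilla_Eventlog"  // hypothetical import location
val source = Source.fromFile(path)
try {
  // The app id appears in the event log JSON, as in the example above.
  val matches = source.getLines()
    .exists(_.contains("\"spark.app.id\":\"156_Vanilla_Eventlog\""))
  println(s"file name matches embedded spark.app.id: $matches")
} finally source.close()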
select count
This is the input profile of a count job over 888 Parquet files, each about 1 GB.
Only the Parquet metadata is accessed.
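That is expected: for a plain count over Parquet, Spark can answer from the row-count statistics in each file's footer instead of scanning column data. A minimal sketch to reproduce (the path is hypothetical):

import org.apache.spark.sql.SparkSession

object ParquetCountExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("parquet-count").getOrCreate()
    // "/data/events" stands in for the 888-file directory above.
    val count = spark.read.parquet("/data/events").count()
    println(count)  // per-task input stays tiny: footers only, not column chunks
    spark.stop()
  }
}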
SQL Tab
On the SQL tab, the job statistics are accumulated by listening for onJobStart events:
override def onJobStart(event: SparkListenerJobStart): Unit = {
  val executionIdString = event.properties.getProperty(SQLExecution.EXECUTION_ID_KEY)
  if (executionIdString == null) {
    // This is not a job created by SQL
    return
  }
  val executionId = executionIdString.toLong
  val jobId = event.jobId
  // ... (the full listing appears in the "Caveats" section below)
org.apache.spark.sql.execution.ui.AllExecutionsPage
How Duration is computed
override def headers: Seq[Node] = {
  // Information for each header: title, sortable, tooltip
  val executionHeadersAndCssClasses: Seq[(String, Boolean, Option[String])] =
    Seq(
      ("ID", true, None),
      ("Description", true, None),
      ("Submitted", true, None),
      ("Duration", true, Some("Time from query submission to completion (or if still executing," +
        " time since submission)"))) ++ {
      if (showRunningJobs && showSucceededJobs && showFailedJobs) {
        Seq(
          ("Running Job IDs", true, None),
          ("Succeeded Job IDs", true, None),
          ("Failed Job IDs", true, None))
      } else if (showSucceededJobs && showFailedJobs) {
        Seq(
          ("Succeeded Job IDs", true, None),
          ("Failed Job IDs", true, None))
      } else {
        Seq(("Job IDs", true, None))
      }
    }
<td sorttable_customkey={submissionTime.toString}>
  {UIUtils.formatDate(submissionTime)}
</td>
<td sorttable_customkey={duration.toString}>
  {UIUtils.formatDuration(duration)}
</td>
override def row(executionTableRow: ExecutionTableRowData): Seq[Node] = {
  val executionUIData = executionTableRow.executionUIData
  val submissionTime = executionUIData.submissionTime
  val duration = executionTableRow.duration
private[ui] class ExecutionTableRowData(
    val duration: Long,
    val executionUIData: SQLExecutionUIData,
    val runningJobData: Seq[Int],
    val completedJobData: Seq[Int],
    val failedJobData: Seq[Int])
The key part is here:
val duration = executionUIData.completionTime.map(_.getTime())
.getOrElse(currentTime) - executionUIData.submissionTime
Note the getOrElse here: completionTime takes precedence, and currentTime is only the fallback while the execution has not completed yet.
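A quick trace of that fallback with hypothetical timestamps:

import java.util.Date

val submissionTime = 1000L
val currentTime = 6000L

// Finished execution: duration = completionTime - submissionTime.
val finished: Option[Date] = Some(new Date(4000L))
assert(finished.map(_.getTime).getOrElse(currentTime) - submissionTime == 3000L)

// Still running: completionTime is None, so currentTime is used instead.
val stillRunning: Option[Date] = None
assert(stillRunning.map(_.getTime).getOrElse(currentTime) - submissionTime == 5000L)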
In addition, runningJobData, completedJobData, and failedJobData collect the job IDs for each status category:
private def executionRow(executionUIData: SQLExecutionUIData): ExecutionTableRowData = {
  val duration = executionUIData.completionTime.map(_.getTime())
    .getOrElse(currentTime) - executionUIData.submissionTime
  val runningJobData = if (showRunningJobs) {
    executionUIData.jobs.filter {
      case (_, jobStatus) => jobStatus == JobExecutionStatus.RUNNING
    }.map { case (jobId, _) => jobId }.toSeq.sorted
  } else Seq.empty
  val completedJobData = if (showSucceededJobs) {
    executionUIData.jobs.filter {
      case (_, jobStatus) => jobStatus == JobExecutionStatus.SUCCEEDED
    }.map { case (jobId, _) => jobId }.toSeq.sorted
  } else Seq.empty
  val failedJobData = if (showFailedJobs) {
    executionUIData.jobs.filter {
      case (_, jobStatus) => jobStatus == JobExecutionStatus.FAILED
    }.map { case (jobId, _) => jobId }.toSeq.sorted
  } else Seq.empty
  new ExecutionTableRowData(
    duration,
    executionUIData,
    runningJobData,
    completedJobData,
    failedJobData)
}
Where currentTime comes from
override def render(request: HttpServletRequest): Seq[Node] = {
  val currentTime = System.currentTimeMillis()
  val running = new mutable.ArrayBuffer[SQLExecutionUIData]()
  val completed = new mutable.ArrayBuffer[SQLExecutionUIData]()
  val failed = new mutable.ArrayBuffer[SQLExecutionUIData]()
Converting the duration Long to a string
org.apache.spark.ui.UIUtils
def formatDuration(milliseconds: Long): String = {
  if (milliseconds < 100) {
    return "%d ms".format(milliseconds)
  }
  val seconds = milliseconds.toDouble / 1000
  if (seconds < 1) {
    return "%.1f s".format(seconds)
  }
  if (seconds < 60) {
    return "%.0f s".format(seconds)
  }
  val minutes = seconds / 60
  if (minutes < 10) {
    return "%.1f min".format(minutes)
  } else if (minutes < 60) {
    return "%.0f min".format(minutes)
  }
  val hours = minutes / 60
  "%.1f h".format(hours)
}
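UIUtils is private[spark], so this is not callable from user code, but tracing the thresholds above by hand gives, for example:

formatDuration(42L)       // "42 ms"   (< 100 ms branch)
formatDuration(5000L)     // "5 s"     (< 60 s branch)
formatDuration(90000L)    // "1.5 min" (< 10 min branch)
formatDuration(7200000L)  // "2.0 h"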
How running/failed/completed is decided
Note that here the executions are read from sqlStore:
sqlStore.executionsList().foreach { e =>
  val isRunning = e.completionTime.isEmpty ||
    e.jobs.exists { case (_, status) => status == JobExecutionStatus.RUNNING }
  val isFailed = e.jobs.exists { case (_, status) => status == JobExecutionStatus.FAILED }
  if (isRunning) {
    running += e
  } else if (isFailed) {
    failed += e
  } else {
    completed += e
  }
}
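A minimal trace of these rules with a hypothetical jobs map (the same shape as SQLExecutionUIData.jobs, shown below): a single FAILED job puts the whole execution in the failed bucket once it has completed.

import org.apache.spark.JobExecutionStatus

val completionTime: Option[java.util.Date] = Some(new java.util.Date())
val jobs: Map[Int, JobExecutionStatus] = Map(
  0 -> JobExecutionStatus.SUCCEEDED,
  1 -> JobExecutionStatus.FAILED)

val isRunning = completionTime.isEmpty ||
  jobs.exists { case (_, s) => s == JobExecutionStatus.RUNNING }             // false
val isFailed = jobs.exists { case (_, s) => s == JobExecutionStatus.FAILED } // true
// => counted as failed, even though job 0 succeeded.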
e.jobs is backed by the following data structure:
class SQLExecutionUIData(
    @KVIndexParam val executionId: Long,
    val description: String,
    val details: String,
    val physicalPlanDescription: String,
    val modifiedConfigs: Map[String, String],
    val metrics: Seq[SQLPlanMetric],
    val submissionTime: Long,
    val completionTime: Option[Date],
    @JsonDeserialize(keyAs = classOf[Integer])
    val jobs: Map[Int, JobExecutionStatus],
    @JsonDeserialize(contentAs = classOf[Integer])
    val stages: Set[Int],
    /**
     * This field is only populated after the execution is finished; it will be null while the
     * execution is still running. During execution, aggregate metrics need to be retrieved
     * from the SQL listener instance.
     */
    @JsonDeserialize(keyAs = classOf[JLong])
    val metricValues: Map[Long, String]) {

  @JsonIgnore @KVIndex("completionTime")
  private def completionTimeIndex: Long = completionTime.map(_.getTime).getOrElse(-1L)
}
Caveats
Note that AllExecutionsPage reads sqlStore while SQLAppStatusListener writes to it; the two run asynchronously:
override def onJobStart(event: SparkListenerJobStart): Unit = {
  val executionIdString = event.properties.getProperty(SQLExecution.EXECUTION_ID_KEY)
  if (executionIdString == null) {
    // This is not a job created by SQL
    return
  }
  val executionId = executionIdString.toLong
  val jobId = event.jobId
  val exec = Option(liveExecutions.get(executionId))
    .orElse {
      try {
        // Should not overwrite the kvstore with new entry, if it already has the SQLExecution
        // data corresponding to the execId.
        val sqlStoreData = kvstore.read(classOf[SQLExecutionUIData], executionId)
        val executionData = new LiveExecutionData(executionId)
        executionData.description = sqlStoreData.description
        executionData.details = sqlStoreData.details
        executionData.physicalPlanDescription = sqlStoreData.physicalPlanDescription
        executionData.modifiedConfigs = sqlStoreData.modifiedConfigs
        executionData.metrics = sqlStoreData.metrics
        executionData.submissionTime = sqlStoreData.submissionTime
        executionData.completionTime = sqlStoreData.completionTime
        executionData.jobs = sqlStoreData.jobs
        executionData.stages = sqlStoreData.stages
        executionData.metricsValues = sqlStoreData.metricValues
        executionData.endEvents.set(sqlStoreData.jobs.size + 1)
        liveExecutions.put(executionId, executionData)
        Some(executionData)
      } catch {
        case _: NoSuchElementException => None
      }
    }.getOrElse(getOrCreateExecution(executionId))
The records here are restored with sqlStore as the source of truth.
The reason executionData.endEvents.set(sqlStoreData.jobs.size + 1) adds 1 is that the SQL-execution-end event has already been received; it was precisely because that end event arrived that liveExecutions dropped this execution ID in the first place. So the +1 accounts for the earlier jobs.size end events plus the one execution-end event.
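In other words, the counter is rebuilt to reflect every end event already seen. A hypothetical trace:

import java.util.concurrent.atomic.AtomicInteger

// Suppose sqlStore recorded 3 jobs for this execution before it was evicted
// from liveExecutions. All 3 job-end events plus the one SQL-execution-end
// event had already arrived, so the restored counter starts at 4.
val endEvents = new AtomicInteger()
val storedJobs = 3
endEvents.set(storedJobs + 1)
assert(endEvents.get() == 4)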
SparkListenerEvent
SparkListenerApplicationStart
case class SparkListenerApplicationStart(
    appName: String,
    appId: Option[String],
    time: Long,
    sparkUser: String,
    appAttemptId: Option[String],
    driverLogs: Option[Map[String, String]] = None,
    driverAttributes: Option[Map[String, String]] = None) extends SparkListenerEvent
def applicationStartToJson(applicationStart: SparkListenerApplicationStart): JValue = {
  ("Event" -> SPARK_LISTENER_EVENT_FORMATTED_CLASS_NAMES.applicationStart) ~
  ("App Name" -> applicationStart.appName) ~
  ("App ID" -> applicationStart.appId.map(JString(_)).getOrElse(JNothing)) ~
  ("Timestamp" -> applicationStart.time) ~
  ("User" -> applicationStart.sparkUser) ~
  ("App Attempt ID" -> applicationStart.appAttemptId.map(JString(_)).getOrElse(JNothing)) ~
  ("Driver Logs" -> applicationStart.driverLogs.map(mapToJson).getOrElse(JNothing)) ~
  ("Driver Attributes" -> applicationStart.driverAttributes.map(mapToJson).getOrElse(JNothing))
}
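In an event log this serializes to a single JSON line along these lines (all values below are made up for illustration; Option fields that are None map to JNothing and are simply omitted):

{"Event":"SparkListenerApplicationStart","App Name":"Spark shell","App ID":"app-20230501120000-0001","Timestamp":1682935200000,"User":"spark","App Attempt ID":"1"}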
TaskLocality
Related link: https://spark.apache.org/docs/3.3.2/tuning.html
@DeveloperApi
object TaskLocality extends Enumeration {
  // Process local is expected to be used ONLY within TaskSetManager for now.
  val PROCESS_LOCAL, NODE_LOCAL, NO_PREF, RACK_LOCAL, ANY = Value

  type TaskLocality = Value

  def isAllowed(constraint: TaskLocality, condition: TaskLocality): Boolean = {
    condition <= constraint
  }
}
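Because the Enumeration values are declared in order, PROCESS_LOCAL < NODE_LOCAL < NO_PREF < RACK_LOCAL < ANY, and isAllowed is just an ordering check. For example:

import org.apache.spark.scheduler.TaskLocality

TaskLocality.isAllowed(TaskLocality.NODE_LOCAL, TaskLocality.PROCESS_LOCAL) // true
TaskLocality.isAllowed(TaskLocality.NODE_LOCAL, TaskLocality.RACK_LOCAL)    // false
TaskLocality.isAllowed(TaskLocality.ANY, TaskLocality.NO_PREF)              // true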
PROCESS_LOCAL: data is in the same JVM as the running code. This is the best locality possible.
NODE_LOCAL: data is on the same node. Examples might be in HDFS on the same node, or in another executor on the same node. This is a little slower than PROCESS_LOCAL because the data has to travel between processes.
NO_PREF: data is accessed equally quickly from anywhere and has no locality preference.
RACK_LOCAL: data is on the same rack of servers. Data is on a different server on the same rack so needs to be sent over the network, typically through a single switch.
ANY: data is elsewhere on the network and not in the same rack.
Spark Stage Event Timeline
This view helps assess where the time is mainly being spent.
It also helps check whether tasks are balanced or whether there is skew.