Spark Job Submission Flow and Related Concepts

  • Number of stages = 1 + the number of shuffle dependencies
  • Number of tasks in a stage = the number of partitions of the last RDD in that stage
  • Number of jobs = the number of times an action operator is called in a Spark application (a concrete sketch follows this list)
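A minimal sketch (assuming a local SparkContext named sc; the numbers in the comments are what the three rules above predict, not measured output):

	val data   = sc.parallelize(1 to 100, 4)     // 4 partitions
	val paired = data.map(x => (x % 10, x))      // narrow dependency
	val summed = paired.reduceByKey(_ + _, 2)    // one shuffle dependency, result has 2 partitions

	summed.collect()   // action #1 -> job #1: 1 + 1 = 2 stages;
	                   //   the ShuffleMapStage has 4 tasks (its last RDD, paired, has 4 partitions),
	                   //   the ResultStage has 2 tasks (summed has 2 partitions)
	summed.count()     // action #2 -> job #2; the existing shuffle output can be reused,
	                   //   so the ShuffleMapStage may show up as "skipped" in the UI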

 

Submitting a Spark job: calling an action operator --> calls the runJob method of the RDD class --> calls dagScheduler.runJob inside SparkContext

--> the DAGScheduler.handleJobSubmitted method

    generates the finalStage

    finalStage = createResultStage()

    submitStage(finalStage)   // this is where "one partition, one task" shows up; the key code is partitionsToCompute.map

--> inside createResultStage(), getOrCreateParentStages() finds every ShuffleDependency (i.e., every wide dependency) and maps over them, creating one stage per shuffle dependency; in other words, stages are cut at shuffle boundaries
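This shuffle-based splitting is visible from the user side as well. A minimal sketch (assuming a local SparkContext named sc): RDD.toDebugString indents the lineage at every ShuffleDependency, which is exactly where the DAGScheduler cuts a new stage (the sample output below is approximate).

	val words  = sc.parallelize(Seq("a", "b", "a"), 2)
	val counts = words.map(w => (w, 1)).reduceByKey(_ + _)   // wide dependency -> extra stage
	println(counts.toDebugString)
	// (2) ShuffledRDD[2] at reduceByKey ...          <- read by the ResultStage
	//  +-(2) MapPartitionsRDD[1] at map ...          <- computed by a ShuffleMapStage
	//     |  ParallelCollectionRDD[0] at parallelize ...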

 

(1)	rdd.foreach()

(2)	RDD {		// the RDD class
		def foreach(f: T => Unit): Unit = {	// the action operator
			val cleanF = sc.clean(f)	// clean the closure so it can be serialized
			sc.runJob(this, (iter: Iterator[T]) => iter.foreach(cleanF))
		}
	}

(3)	SparkContext	// the SparkContext class: the runJob overloads call one another,
			// each filling in defaults, until the call reaches dagScheduler.runJob

	runJob(rdd, func, 0 until rdd.partitions.length)
	runJob(rdd, (ctx: TaskContext, it: Iterator[T]) => cleanedFunc(it), partitions)
	runJob[T, U](rdd, func, partitions, (index, res) => results(index) = res)
	dagScheduler.runJob(rdd, cleanedFunc, partitions, callSite, resultHandler, localProperties.get)

(4) DAGScheduler

	// DAGScheduler.runJob submits the job and blocks on the returned JobWaiter
	val waiter = submitJob(rdd, func, partitions, callSite, resultHandler, properties)

	submitJob {
		// used to sanity-check the requested partition ids
		val maxPartitions = rdd.partitions.length

		// every job gets a new, monotonically increasing job id
		val jobId = nextJobId.getAndIncrement()

		// submitJob does not handle the job itself; it posts a JobSubmitted event
		// to the DAGScheduler's internal event loop
		eventProcessLoop.post(JobSubmitted(
		  jobId, rdd, func2, partitions.toArray, callSite, waiter,
		  SerializationUtils.clone(properties)))
	}

	// the event thread dequeues the event and dispatches it to handleJobSubmitted
	private def doOnReceive(event: DAGSchedulerEvent): Unit = event match {
		case JobSubmitted(jobId, rdd, func, partitions, callSite, listener, properties) =>
		  dagScheduler.handleJobSubmitted(jobId, rdd, func, partitions, callSite, listener, properties)
	}
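What connects submitJob to handleJobSubmitted is this internal event loop: post only enqueues a JobSubmitted event, and a dedicated event thread dequeues it and dispatches through doOnReceive. A minimal sketch of that producer/consumer pattern (names such as MiniEventLoop and JobSubmittedEvent are illustrative, not Spark's own classes):

	import java.util.concurrent.LinkedBlockingDeque

	sealed trait Event
	case class JobSubmittedEvent(jobId: Int) extends Event

	class MiniEventLoop(handle: Event => Unit) {
	  private val queue = new LinkedBlockingDeque[Event]()
	  private val eventThread = new Thread("mini-event-loop") {
	    setDaemon(true)
	    override def run(): Unit = while (true) handle(queue.take())   // dequeue and dispatch
	  }
	  def start(): Unit = eventThread.start()
	  def post(event: Event): Unit = queue.put(event)                  // enqueue and return immediately
	}

	// the handler plays the role of doOnReceive dispatching to handleJobSubmitted
	val loop = new MiniEventLoop({ case JobSubmittedEvent(id) => println(s"handleJobSubmitted for job $id") })
	loop.start()
	loop.post(JobSubmittedEvent(0))
	Thread.sleep(100)   // give the daemon event thread a moment before the driver exits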
	
	
	
	private[scheduler] def handleJobSubmitted {
		var finalStage: ResultStage = null
		try {
		  // New stage creation may throw an exception if, for example, jobs are run on a
		  // HadoopRDD whose underlying HDFS files have been deleted.
		  finalStage = createResultStage(finalRDD, func, partitions, jobId, callSite)
		} catch {
		  case e: Exception => listener.jobFailed(e); return
		}
		// create the ActiveJob for this finalStage
		val job = new ActiveJob(jobId, finalStage, callSite, listener, properties)
		// submit the stage (parent stages first, recursively)
		submitStage(finalStage)
	}
	
	
			private def createResultStage: ResultStage = {
				// if the rdd's lineage contains shuffles, create the parent stages first
				val parents = getOrCreateParentStages(rdd, jobId)
				val id = nextStageId.getAndIncrement()
				// with or without a shuffle, there is always exactly one ResultStage
				val stage = new ResultStage(id, rdd, func, partitions, parents, jobId, callSite)
				stageIdToStage(id) = stage
				updateJobIdStageIdMaps(jobId, stage)
				stage
			}
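A minimal sketch of that last point (assuming a local SparkContext named sc): a job whose lineage contains no shuffle still gets exactly one stage, the ResultStage, with an empty list of parent stages.

	sc.parallelize(1 to 10, 2)
	  .map(_ * 2)
	  .collect()   // 1 job, 1 stage (the ResultStage), 2 tasks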
		
						private def getOrCreateParentStages(rdd: RDD[_], firstJobId: Int): List[Stage] = {
							// map over each shuffle dependency, i.e., create one ShuffleMapStage per shuffle
							getShuffleDependencies(rdd).map { shuffleDep =>
							  getOrCreateShuffleMapStage(shuffleDep, firstJobId)
							}.toList
						}
								private[scheduler] def getShuffleDependencies(
									  rdd: RDD[_]): HashSet[ShuffleDependency[_, _, _]] = {
									val parents = new HashSet[ShuffleDependency[_, _, _]]
									val visited = new HashSet[RDD[_]]
									val waitingForVisit = new Stack[RDD[_]]
									// push the starting rdd onto the stack
									waitingForVisit.push(rdd)
									// loop while the stack is non-empty (it is not, we just pushed)
									while (waitingForVisit.nonEmpty) {
										// pop the next rdd to inspect
									  val toVisit = waitingForVisit.pop()
									  // only process an rdd that has not been visited yet
									  if (!visited(toVisit)) {
										visited += toVisit
										// walk over every dependency of this rdd
										toVisit.dependencies.foreach {
										  case shuffleDep: ShuffleDependency[_, _, _] =>
											// a shuffle dependency: record it, do not descend further
											parents += shuffleDep
										  case dependency =>
											// a narrow dependency: keep walking up the lineage
											waitingForVisit.push(dependency.rdd)
										}
									  }
									}
									// return the nearest shuffle dependencies of the given rdd
									parents
								}
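Note that this traversal pushes the parent RDD of a narrow dependency but does not descend past a ShuffleDependency, so it returns only the nearest shuffle dependencies; more distant shuffles are discovered later, when the parent ShuffleMapStage is created and its own parents are computed recursively. A minimal sketch (assuming a local SparkContext named sc; the comments describe the expected behaviour of the traversal above):

	val a = sc.parallelize(1 to 10, 2).map(x => (x % 2, x))
	val b = a.reduceByKey(_ + _)              // shuffle #1
	val c = b.map { case (k, v) => (v, k) }   // narrow dependency
	val d = c.groupByKey()                    // shuffle #2
	// getShuffleDependencies(d) would return only shuffle #2; shuffle #1 becomes
	// the parent of shuffle #2's ShuffleMapStage, giving 1 + 2 = 3 stages in total
	// for an action on d.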
	
	
	        private def submitStage(stage: Stage) {
				// once all parent stages are available, build and submit this stage's tasks
				// (key parts of submitMissingTasks are shown inline below)
				submitMissingTasks(stage, jobId.get) {
					......
					val tasks: Seq[Task[_]] = try {
					  stage match {
						case stage: ShuffleMapStage =>
						  partitionsToCompute.map { id =>		// one ShuffleMapTask is created per partition, i.e., one partition == one task
							val locs = taskIdToLocations(id)
							val part = stage.rdd.partitions(id)
							new ShuffleMapTask(stage.id, stage.latestInfo.attemptId,
							  taskBinary, part, locs, stage.latestInfo.taskMetrics, properties, Option(jobId),
							  Option(sc.applicationId), sc.applicationAttemptId)
						  }

						case stage: ResultStage =>
						  partitionsToCompute.map { id =>		// likewise, one ResultTask per partition of the final stage
							val p: Int = stage.partitions(id)
							val part = stage.rdd.partitions(p)
							val locs = taskIdToLocations(id)
							new ResultTask(stage.id, stage.latestInfo.attemptId,
							  taskBinary, part, locs, id, properties, stage.latestInfo.taskMetrics,
							  Option(jobId), Option(sc.applicationId), sc.applicationAttemptId)
						  }
					  }
					}
					......
					// hand the TaskSet over to the TaskScheduler for actual execution
					taskScheduler.submitTasks(new TaskSet(tasks.toArray, stage.id, stage.latestInfo.attemptId, jobId, properties))
				}
			}

 
