Spark Shuffle初探

最新推荐文章于 2021-11-03 21:01:04 发布

yjh314

最新推荐文章于 2021-11-03 21:01:04 发布

阅读量355

点赞数

分类专栏： spark

spark 专栏收录该内容

27 篇文章 0 订阅

订阅专栏

之前一直疑惑Shuffle过程中的读和写究竟是在哪里实现的，一直误以为读和写都是在RDD的转换过程中实现的。但是追踪reduceByKey的代码（最终调用的是下面的combineByKey），却只找到了生成ShuffledRDD的过程，在ShuffledRDD的compute函数中有读取过程，那么写入过程究竟在哪里呢？

[java]view plaincopy 
   
 PairRDDFunctions  
   
 /**
  * Combines the values of each key with the supplied aggregation functions and returns
  * the result as an RDD of (key, combined value) pairs.
  *
  * Three paths exist, chosen in this order:
  *  1. `self` is already partitioned by `partitioner`: values are merged per key inside each
  *     partition and no ShuffledRDD is created.
  *  2. `mapSideCombine` is false: the raw (K, V) records go through a ShuffledRDD and are
  *     merged per key afterwards.
  *  3. otherwise (the default, since `mapSideCombine` defaults to true): values are merged
  *     per key before the ShuffledRDD, and the partial combiners are merged after it.
  *
  * @param createCombiner  passed to the Aggregator as the function creating a C from a V
  * @param mergeValue      passed to the Aggregator as the function folding a V into a C
  * @param mergeCombiners  passed to the Aggregator as the function merging two C values
  * @param partitioner     partitioner handed to the ShuffledRDD
  * @param mapSideCombine  whether to combine values before the shuffle (default true)
  * @param serializer      handed to `ShuffledRDD.setSerializer`; defaults to null
  * @return an RDD of (K, C) pairs
  */
 def combineByKey[C](createCombiner: V => C,
       mergeValue: (C, V) => C,
       mergeCombiners: (C, C) => C,
       partitioner: Partitioner,
       mapSideCombine: Boolean = true,
       serializer: Serializer = null): RDD[(K, C)] = {

     val agg = new Aggregator[K, V, C](createCombiner, mergeValue, mergeCombiners)
     val alreadyPartitioned = self.partitioner == Some(partitioner)

     if (alreadyPartitioned) {
       // Taken only when self is already partitioned by `partitioner`. Per the original
       // author's note a plain RDD usually has no partitioner, so this is the uncommon case;
       // when it does apply, one per-key merge of the values is all that is needed.
       self.mapPartitionsWithContext((ctx, records) => {
         new InterruptibleIterator(ctx, agg.combineValuesByKey(records, ctx))
       }, preservesPartitioning = true)
     } else if (!mapSideCombine) {
       // No map-side combine requested: shuffle the raw records, then merge values per key.
       val shuffledValues =
         new ShuffledRDD[K, V, (K, V)](self, partitioner).setSerializer(serializer)
       shuffledValues.mapPartitionsWithContext((ctx, records) => {
         new InterruptibleIterator(ctx, agg.combineValuesByKey(records, ctx))
       }, preservesPartitioning = true)
     } else {
       // Default path: combine on the map side first, shuffle the partial combiners,
       // then merge the combiners of each key on the reduce side.
       val mapSideCombined = self.mapPartitionsWithContext((ctx, records) => {
         agg.combineValuesByKey(records, ctx)
       }, preservesPartitioning = true)
       val shuffledCombiners =
         new ShuffledRDD[K, C, (K, C)](mapSideCombined, partitioner).setSerializer(serializer)
       shuffledCombiners.mapPartitionsWithContext((ctx, combiners) => {
         new InterruptibleIterator(ctx, agg.combineCombinersByKey(combiners, ctx))
       }, preservesPartitioning = true)
     }
   }

观察ShuffledRDD的compute方法，就能看到它是如何读取上一个stage生成的数据的。

[java]view plaincopy 
   
 //ShuffledRDD.scala  
   
 package org.apache.spark.rdd  
   
 import org.apache.spark._  
 import org.apache.spark.annotation.DeveloperApi  
 import org.apache.spark.serializer.Serializer  
   
 /**
  * Partition descriptor that wraps a single partition number.
  *
  * @param idx the partition number; it is exposed unchanged as `index` and is also
  *            returned as the hash code
  */
 private[spark] class ShuffledRDDPartition(val idx: Int) extends Partition {
   // The partition's position is exactly the number it was constructed with.
   override val index: Int = idx
   // Hash on the partition number alone.
   override def hashCode(): Int = idx
 }
   
 /**
  * :: DeveloperApi ::
  * The resulting RDD from a shuffle (e.g. repartitioning of data).
  *
  * The RDD is constructed with an empty dependency list (`Nil`); its single
  * ShuffleDependency on `prev` is supplied by `getDependencies`. The optional serializer,
  * key ordering, aggregator and map-side-combine flag are configured through the chained
  * `set*` methods and are forwarded to that ShuffleDependency.
  *
  * @param prev the parent RDD.
  * @param part the partitioner used to partition the RDD
  * @tparam K the key class.
  * @tparam V the value class.
  * @tparam C the combiner class.
  */
 // TODO: Make this return RDD[Product2[K, C]] or have some way to configure mutable pairs
 @DeveloperApi
 class ShuffledRDD[K, V, C](
     @transient var prev: RDD[_ <: Product2[K, V]],
     part: Partitioner)
   extends RDD[(K, C)](prev.context, Nil) {

   // Optional shuffle settings; each stays empty/false until its setter is called.
   private var serializer: Option[Serializer] = None

   private var keyOrdering: Option[Ordering[K]] = None

   private var aggregator: Option[Aggregator[K, V, C]] = None

   private var mapSideCombine: Boolean = false

   /** Set a serializer for this RDD's shuffle, or null to use the default (spark.serializer) */
   def setSerializer(serializer: Serializer): ShuffledRDD[K, V, C] = {
     // Option(...) turns a null argument into None.
     this.serializer = Option(serializer)
     this
   }

   /** Set key ordering for RDD's shuffle. A null argument is stored as None. */
   def setKeyOrdering(keyOrdering: Ordering[K]): ShuffledRDD[K, V, C] = {
     this.keyOrdering = Option(keyOrdering)
     this
   }

   /** Set aggregator for RDD's shuffle. A null argument is stored as None. */
   def setAggregator(aggregator: Aggregator[K, V, C]): ShuffledRDD[K, V, C] = {
     this.aggregator = Option(aggregator)
     this
   }

   /** Set mapSideCombine flag for RDD's shuffle. */
   def setMapSideCombine(mapSideCombine: Boolean): ShuffledRDD[K, V, C] = {
     this.mapSideCombine = mapSideCombine
     this
   }

   /** Builds the single ShuffleDependency on `prev` from the current shuffle settings. */
   override def getDependencies: Seq[Dependency[_]] = {
     List(new ShuffleDependency(prev, part, serializer, keyOrdering, aggregator, mapSideCombine))
   }

   override val partitioner = Some(part)

   /** Creates one ShuffledRDDPartition per partition of `part`, indexed from 0. */
   override def getPartitions: Array[Partition] = {
     Array.tabulate[Partition](part.numPartitions)(i => new ShuffledRDDPartition(i))
   }

   /**
    * Reads the shuffled records for one partition.
    *
    * Obtains a reader from the shuffle manager of the current SparkEnv, passing
    * `split.index` and `split.index + 1` as the partition bounds, and returns what the
    * reader produces, cast to an iterator of (K, C) pairs.
    *
    * @param split   the partition of this RDD to compute
    * @param context the task context handed to the shuffle reader
    */
   override def compute(split: Partition, context: TaskContext): Iterator[(K, C)] = {
     // The first (and only) dependency is the ShuffleDependency built in getDependencies.
     val dep = dependencies.head.asInstanceOf[ShuffleDependency[K, V, C]]
     SparkEnv.get.shuffleManager.getReader(dep.shuffleHandle, split.index, split.index + 1, context)
       .read()
       .asInstanceOf[Iterator[(K, C)]]
   }

   /** Clears the base class's dependencies and drops the reference to the parent RDD. */
   override def clearDependencies(): Unit = {
     super.clearDependencies()
     prev = null
   }
 }

后来想到ShuffleMapTask，这个名字就很可以，打开代码看看。发现代码很简单，直接粗暴地把结果通过ShuffleManager写入到磁盘。

[java]view plaincopy 
   
 //ShuffleMapTask.scala  
   
 package org.apache.spark.scheduler  
   
 import java.nio.ByteBuffer  
   
 import scala.language.existentials  
   
 import org.apache.spark._  
 import org.apache.spark.broadcast.Broadcast  
 import org.apache.spark.rdd.RDD  
 import org.apache.spark.shuffle.ShuffleWriter  
   
 /**
  * A ShuffleMapTask divides the elements of an RDD into multiple buckets (based on a partitioner
  * specified in the ShuffleDependency).
  *
  * See [[org.apache.spark.scheduler.Task]] for more information.
  *
  * @param stageId id of the stage this task belongs to
  * @param taskBinary broadcast version of the RDD and the ShuffleDependency. Once deserialized,
  *                   the type should be (RDD[_], ShuffleDependency[_, _, _]).
  * @param partition partition of the RDD this task is associated with
  * @param locs preferred task execution locations for locality scheduling
  */
 private[spark] class ShuffleMapTask(
     stageId: Int,
     taskBinary: Broadcast[Array[Byte]],
     partition: Partition,
     @transient private var locs: Seq[TaskLocation])
   extends Task[MapStatus](stageId, partition.index) with Logging {

   /**
    * A constructor used only in test suites. This does not require passing in an RDD.
    *
    * NOTE(review): `partitionId` is not used here; the placeholder partition always reports
    * index 0. Confirm this is intended before relying on the argument.
    */
   def this(partitionId: Int) = {
     this(0, null, new Partition { override def index: Int = 0 }, null)
   }

   // De-duplicated preferred locations; a null `locs` is treated as "no preference".
   @transient private val preferredLocs: Seq[TaskLocation] = {
     if (locs == null) Nil else locs.toSet.toSeq
   }

   /**
    * Deserializes the (RDD, ShuffleDependency) pair from the broadcast task binary, then
    * writes this task's partition of the RDD through a ShuffleWriter obtained from the
    * shuffle manager.
    *
    * If an Exception is thrown while obtaining or using the writer, the writer (if one was
    * obtained) is stopped with `success = false` and the original exception is rethrown.
    * A failure of that cleanup `stop` call is only logged at debug level.
    *
    * @param context the task context passed to the writer and to `rdd.iterator`
    * @return the MapStatus produced by `writer.stop(success = true)`
    */
   override def runTask(context: TaskContext): MapStatus = {
     // Deserialize the RDD using the broadcast variable.
     val deserializeStartTime = System.currentTimeMillis()
     val ser = SparkEnv.get.closureSerializer.newInstance()
     val (rdd, dep) = ser.deserialize[(RDD[_], ShuffleDependency[_, _, _])](
       ByteBuffer.wrap(taskBinary.value), Thread.currentThread.getContextClassLoader)
     _executorDeserializeTime = System.currentTimeMillis() - deserializeStartTime

     metrics = Some(context.taskMetrics)
     // Nullable on purpose: the failure path must know whether a writer was ever obtained.
     var writer: ShuffleWriter[Any, Any] = null
     try {
       val manager = SparkEnv.get.shuffleManager
       writer = manager.getWriter[Any, Any](dep.shuffleHandle, partitionId, context)
       writer.write(rdd.iterator(partition, context).asInstanceOf[Iterator[_ <: Product2[Any, Any]]])
       // Last expression of the try block is the task's result; no explicit return needed.
       writer.stop(success = true).get
     } catch {
       case e: Exception =>
         try {
           if (writer != null) {
             writer.stop(success = false)
           }
         } catch {
           // Named differently from the outer exception so the rethrow below is unambiguous.
           case stopError: Exception =>
             log.debug("Could not stop writer", stopError)
         }
         throw e
     }
   }

   override def preferredLocations: Seq[TaskLocation] = preferredLocs

   override def toString: String = "ShuffleMapTask(%d, %d)".format(stageId, partitionId)
 }

根据Stage的划分机制，只要出现ShuffleDependency，前面的任务就会被包装成ShuffleMapTask，然后在ShuffleMapTask中把前一个Stage的输出进行分区并写到磁盘。这样就不用考虑这个Stage的final RDD的类型了，做到了shuffle write和RDD逻辑的解耦。

ShuffleManager后续发生的事情，参考https://github.com/JerryLead/SparkInternals/blob/master/markdown/4-shuffleDetails.md