Start from the simplest WordCount:
import org.apache.spark.{SparkConf, SparkContext}

object WordCount {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("WordCount").setMaster("local")
    val sc = new SparkContext(conf)
    val rdd1 = sc.textFile("C:\\Users\\macbook air\\Desktop\\test3.txt")
    // note: map((_, 1)) pairs each *line* with 1; a real word count would flatMap on a
    // delimiter first — the simpler form keeps the lineage short for this walkthrough
    val rdd2 = rdd1.map((_, 1))
    val rdd3 = rdd2.reduceByKey(_ + _)
    println(rdd3.collect().toList)
    sc.stop()
  }
}
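As a quick sanity check before stepping into the source, printing the concrete class of each RDD shows the three subclasses that the rest of this walkthrough traces (a minimal sketch, assuming the program above has run up to rdd3):

println(rdd1.getClass.getSimpleName)   // MapPartitionsRDD
println(rdd2.getClass.getSimpleName)   // MapPartitionsRDD
println(rdd3.getClass.getSimpleName)   // ShuffledRDD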
1. textFile creates the initial HadoopRDD:
def textFile(
    path: String,
    minPartitions: Int = defaultMinPartitions): RDD[String] = withScope {
  assertNotStopped()
  // if you know MapReduce, TextInputFormat and LongWritable should look familiar
  hadoopFile(path, classOf[TextInputFormat], classOf[LongWritable], classOf[Text],
    // the trailing .map turns the result into a MapPartitionsRDD
    minPartitions).map(pair => pair._2.toString).setName(path)
}
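So the RDD that textFile hands back is already the MapPartitionsRDD produced by the trailing .map, and the HadoopRDD sits one dependency below it. A quick check (a sketch, reusing rdd1 from the WordCount program):

println(rdd1)                          // ...MapPartitionsRDD[1] at textFile...
println(rdd1.dependencies.head.rdd)    // ...HadoopRDD[0]... — the RDD built by hadoopFile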
Next, step into hadoopFile:
def hadoopFile[K, V](
    path: String,
    inputFormatClass: Class[_ <: InputFormat[K, V]],
    keyClass: Class[K],
    valueClass: Class[V],
    minPartitions: Int = defaultMinPartitions): RDD[(K, V)] = withScope {
  assertNotStopped()
  // force loading of hdfs-site.xml
  FileSystem.getLocal(hadoopConfiguration)
  // a Hadoop configuration can be around 10 KiB, which is fairly large, so broadcast it
  val confBroadcast = broadcast(new SerializableConfiguration(hadoopConfiguration))
  val setInputPathsFunc = (jobConf: JobConf) => FileInputFormat.setInputPaths(jobConf, path)
  new HadoopRDD(
    this,
    confBroadcast,
    Some(setInputPathsFunc),
    inputFormatClass,
    keyClass,
    valueClass,
    minPartitions).setName(path)
}
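For reference, the same HadoopRDD can be built by calling hadoopFile directly with the arguments textFile passes; the trailing .map inside textFile is what throws away the LongWritable byte offsets (a small sketch, assuming the same sc and input file as the WordCount program):

import org.apache.hadoop.io.{LongWritable, Text}
import org.apache.hadoop.mapred.TextInputFormat

val pairs = sc.hadoopFile("C:\\Users\\macbook air\\Desktop\\test3.txt",
  classOf[TextInputFormat], classOf[LongWritable], classOf[Text])
val lines = pairs.map(_._2.toString)   // what textFile's trailing .map does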
Since the focus here is the DAG, the rest can wait for now. HadoopRDD:
class HadoopRDD[K, V](
    sc: SparkContext,
    broadcastedConf: Broadcast[SerializableConfiguration],
    initLocalJobConfFuncOpt: Option[JobConf => Unit],
    inputFormatClass: Class[_ <: InputFormat[K, V]],
    keyClass: Class[K],
    valueClass: Class[V],
    minPartitions: Int)
  // calls the parent RDD constructor with deps = Nil
  extends RDD[(K, V)](sc, Nil) with Logging
Step into RDD:
abstract class RDD[T: ClassTag](
    @transient private var _sc: SparkContext,
    // HadoopRDD invokes this primary constructor with deps = Nil (statically a List[Nothing])
    @transient private var deps: Seq[Dependency[_]]
  ) extends Serializable with Logging {

  def this(@transient oneParent: RDD[_]) =
    this(oneParent.context, List(new OneToOneDependency(oneParent)))

  // getDependencies is the key to building the DAG and cutting stages
  protected def getDependencies: Seq[Dependency[_]] = deps

  private var dependencies_ : Seq[Dependency[_]] = _

  final def dependencies: Seq[Dependency[_]] = {
    // a checkpointed RDD's only dependency is the checkpoint RDD
    checkpointRDD.map(r => List(new OneToOneDependency(r))).getOrElse {
      // for now, read this simply as: if dependencies_ is null, compute it once and cache it
      if (dependencies_ == null) {
        dependencies_ = getDependencies
      }
      dependencies_
    }
  }
}
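Two details from this snippet are easy to confirm at runtime: the HadoopRDD, constructed with deps = Nil, reports no dependencies at all, and dependencies is computed once via getDependencies and then cached in dependencies_ (a sketch, reusing the RDDs from the WordCount program):

val hadoopRdd = rdd1.dependencies.head.rdd        // the HadoopRDD underneath rdd1
println(hadoopRdd.dependencies)                   // List() — its deps was Nil
println(rdd1.dependencies eq rdd1.dependencies)   // true — cached after the first call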
hadoopFile returns: hadoopRDD, deps = Nil
rdd1: MapPartitionsRDD, deps = List(new OneToOneDependency(hadoopRDD))
2. MapPartitionsRDD:
def map[U: ClassTag](f: T => U): RDD[U] = withScope {
  val cleanF = sc.clean(f)
  // create a MapPartitionsRDD with `this` as the parent
  new MapPartitionsRDD[U, T](this, (context, pid, iter) => iter.map(cleanF))
}
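The function handed to MapPartitionsRDD simply applies cleanF lazily over each partition's iterator, so map(f) behaves like a per-partition iterator transform (a sketch of the equivalence, not the actual call path):

val viaMap = rdd1.map((_, 1))
val viaPartitions = rdd1.mapPartitions(iter => iter.map(word => (word, 1)))
// both produce a MapPartitionsRDD with a single OneToOneDependency on rdd1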
Its constructor:
private[spark] class MapPartitionsRDD[U: ClassTag, T: ClassTag](
    var prev: RDD[T],
    f: (TaskContext, Int, Iterator[T]) => Iterator[U],  // (TaskContext, partition index, iterator)
    preservesPartitioning: Boolean = false,
    isFromBarrier: Boolean = false,
    isOrderSensitive: Boolean = false)
  // calls the parent class constructor
  extends RDD[U](prev)
RDD:
// the auxiliary constructor wraps the single parent in a OneToOneDependency
def this(@transient oneParent: RDD[_]) =
  this(oneParent.context, List(new OneToOneDependency(oneParent)))
Dependency:
class OneToOneDependency[T](rdd: RDD[T]) extends NarrowDependency[T](rdd) {
  override def getParents(partitionId: Int): List[Int] = List(partitionId)
}

abstract class NarrowDependency[T](_rdd: RDD[T]) extends Dependency[T] {
  def getParents(partitionId: Int): Seq[Int]
  // the parent RDD
  override def rdd: RDD[T] = _rdd
}

abstract class Dependency[T] extends Serializable {
  def rdd: RDD[T]
}
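getParents encodes the narrow, one-to-one wiring: partition i of the child depends only on partition i of the parent. A quick illustration (a sketch; OneToOneDependency is public under org.apache.spark):

import org.apache.spark.OneToOneDependency

val dep = new OneToOneDependency(rdd1)
println(dep.getParents(0))   // List(0)
println(dep.getParents(3))   // List(3)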
rdd2: MapPartitionsRDD, deps = List(new OneToOneDependency(rdd1))
3. ShuffledRDD:
def reduceByKey(func: (V, V) => V): RDD[(K, V)] = self.withScope {
  reduceByKey(defaultPartitioner(self), func)
}

def reduceByKey(partitioner: Partitioner, func: (V, V) => V): RDD[(K, V)] = self.withScope {
  combineByKeyWithClassTag[V]((v: V) => v, func, func, partitioner)
}
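defaultPartitioner(self) reuses a partitioner already set on a parent RDD when one exists, and otherwise falls back to a HashPartitioner; passing a partitioner explicitly goes through the same second overload (a sketch, reusing rdd2):

import org.apache.spark.HashPartitioner

val rdd3Explicit = rdd2.reduceByKey(new HashPartitioner(rdd2.getNumPartitions), _ + _)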
combineByKeyWithClassTag:
def combineByKeyWithClassTag[C](
    createCombiner: V => C,
    mergeValue: (C, V) => C,
    mergeCombiners: (C, C) => C,
    partitioner: Partitioner,
    mapSideCombine: Boolean = true,
    serializer: Serializer = null)(implicit ct: ClassTag[C]): RDD[(K, C)] = self.withScope {
  require(mergeCombiners != null, "mergeCombiners must be defined") // required as of Spark 0.9.0
  if (keyClass.isArray) {
    if (mapSideCombine) {
      throw new SparkException("Cannot use map-side combining with array keys.")
    }
    if (partitioner.isInstanceOf[HashPartitioner]) {
      throw new SparkException("HashPartitioner cannot partition array keys.")
    }
  }
  val aggregator = new Aggregator[K, V, C](
    self.context.clean(createCombiner),
    self.context.clean(mergeValue),
    self.context.clean(mergeCombiners))
  if (self.partitioner == Some(partitioner)) {
    // already partitioned the same way: aggregate in place, no shuffle needed
    self.mapPartitions(iter => {
      val context = TaskContext.get()
      new InterruptibleIterator(context, aggregator.combineValuesByKey(iter, context))
    }, preservesPartitioning = true)
  } else {
    // otherwise create a ShuffledRDD
    new ShuffledRDD[K, V, C](self, partitioner)
      .setSerializer(serializer)
      .setAggregator(aggregator)
      .setMapSideCombine(mapSideCombine)
  }
}
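The if/else at the end is the branch that matters for the DAG: when the parent is already partitioned with the same partitioner, the aggregation runs inside mapPartitions and no ShuffledRDD (and hence no new stage) is created. A small experiment (a sketch, reusing rdd2 and rdd3):

import org.apache.spark.HashPartitioner

val prePartitioned = rdd2.partitionBy(new HashPartitioner(4))
val noShuffle = prePartitioned.reduceByKey(new HashPartitioner(4), _ + _)
println(noShuffle.getClass.getSimpleName)   // MapPartitionsRDD — no additional shuffle
println(rdd3.getClass.getSimpleName)        // ShuffledRDD — rdd2 had no partitioner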
Creating the ShuffledRDD:
class ShuffledRDD[K: ClassTag, V: ClassTag, C: ClassTag](
    @transient var prev: RDD[_ <: Product2[K, V]],
    part: Partitioner)
  extends RDD[(K, C)](prev.context, Nil) {

  // unlike the default in RDD, getDependencies is overridden here
  override def getDependencies: Seq[Dependency[_]] = {
    val serializer = userSpecifiedSerializer.getOrElse {
      val serializerManager = SparkEnv.get.serializerManager
      if (mapSideCombine) {
        serializerManager.getSerializer(implicitly[ClassTag[K]], implicitly[ClassTag[C]])
      } else {
        serializerManager.getSerializer(implicitly[ClassTag[K]], implicitly[ClassTag[V]])
      }
    }
    List(new ShuffleDependency(prev, part, serializer, keyOrdering, aggregator, mapSideCombine))
  }
ShuffleDependency:
class ShuffleDependency[K: ClassTag, V: ClassTag, C: ClassTag](
    @transient private val _rdd: RDD[_ <: Product2[K, V]],
    val partitioner: Partitioner,
    val serializer: Serializer = SparkEnv.get.serializer,
    val keyOrdering: Option[Ordering[K]] = None,
    val aggregator: Option[Aggregator[K, V, C]] = None,
    val mapSideCombine: Boolean = false,
    val shuffleWriterProcessor: ShuffleWriteProcessor = new ShuffleWriteProcessor)
  extends Dependency[Product2[K, V]] {
  // the parent RDD
  override def rdd: RDD[Product2[K, V]] = _rdd.asInstanceOf[RDD[Product2[K, V]]]
}
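So the only dependency of rdd3 is a ShuffleDependency pointing back at rdd2, which is exactly the edge where the DAGScheduler later cuts a stage boundary. A quick check (a sketch):

import org.apache.spark.ShuffleDependency

println(rdd3.dependencies.head.isInstanceOf[ShuffleDependency[_, _, _]])   // true
println(rdd3.dependencies.head.rdd eq rdd2)                                // true — the parent is rdd2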
rdd3: ShuffledRDD, deps = List(new ShuffleDependency(rdd2))
4. The DAG:
hadoopFile returns: hadoopRDD, deps = Nil
rdd1: MapPartitionsRDD, deps = List(new OneToOneDependency(hadoopRDD))
rdd2: MapPartitionsRDD, deps = List(new OneToOneDependency(rdd1))
rdd3: ShuffledRDD, deps = List(new ShuffleDependency(rdd2))
Chaining them together (following each dependency's rdd field):
rdd3.deps.head.rdd -> rdd2
rdd2.deps.head.rdd -> rdd1
rdd1.deps.head.rdd -> hadoopRDD
hadoopRDD.deps -> Nil
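The same chain can be read off at runtime without touching any internals: toDebugString prints the lineage (with the shuffle indented as a separate stage), and following dependencies.head.rdd by hand walks from rdd3 down to the HadoopRDD (a sketch):

import org.apache.spark.rdd.RDD

println(rdd3.toDebugString)

var cur: RDD[_] = rdd3
while (cur.dependencies.nonEmpty) {
  println(cur)
  cur = cur.dependencies.head.rdd
}
println(cur)   // the HadoopRDD, whose deps is Nil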