Creating the DAG

Let's start from the simplest WordCount program:

import org.apache.spark.{SparkConf, SparkContext}

object WordCount {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("WordCount").setMaster("local")
    val sc = new SparkContext(conf)

    // Three transformations, three RDDs: textFile -> map -> reduceByKey
    val rdd1 = sc.textFile("C:\\Users\\macbook air\\Desktop\\test3.txt")
    val rdd2 = rdd1.map((_, 1))
    val rdd3 = rdd2.reduceByKey(_ + _)

    // collect() is the action that actually triggers the job
    println(rdd3.collect().toList)

    sc.stop()
  }
}
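
Before diving into the source, it helps to let Spark describe the lineage itself. The sketch below is not part of the original program; it assumes it runs inside the same main method, after rdd3 is defined, and simply previews the chain that the rest of the post derives by hand:

println(rdd3.toDebugString)
// Prints something like (RDD ids and partition counts depend on the environment):
// (N) ShuffledRDD[3] at reduceByKey ...
//  +-(N) MapPartitionsRDD[2] at map ...
//     |  ... MapPartitionsRDD[1] at textFile ...
//     |  ... HadoopRDD[0] at textFile ...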

1. textFile creates the initial HadoopRDD

def textFile(
      path: String,
      minPartitions: Int = defaultMinPartitions): RDD[String] = withScope {
    assertNotStopped()
    // If you know MapReduce, TextInputFormat and LongWritable should look familiar
    hadoopFile(path, classOf[TextInputFormat], classOf[LongWritable], classOf[Text],
      // map then converts the result into a MapPartitionsRDD
      minPartitions).map(pair => pair._2.toString).setName(path)
  }

Next, step into hadoopFile:

  def hadoopFile[K, V](
      path: String,
      inputFormatClass: Class[_ <: InputFormat[K, V]],
      keyClass: Class[K],
      valueClass: Class[V],
      minPartitions: Int = defaultMinPartitions): RDD[(K, V)] = withScope {
    assertNotStopped()

    // Force loading of hdfs-site.xml
    FileSystem.getLocal(hadoopConfiguration)

    // A Hadoop configuration can be around 10 KiB, which is pretty big, so broadcast it
    val confBroadcast = broadcast(new SerializableConfiguration(hadoopConfiguration))
    val setInputPathsFunc = (jobConf: JobConf) => FileInputFormat.setInputPaths(jobConf, path)
    new HadoopRDD(
      this,
      confBroadcast,
      Some(setInputPathsFunc),
      inputFormatClass,
      keyClass,
      valueClass,
      minPartitions).setName(path)
  }

Since the focus here is the DAG, everything else is set aside for now. On to HadoopRDD:

class HadoopRDD[K, V](
    sc: SparkContext,
    broadcastedConf: Broadcast[SerializableConfiguration],
    initLocalJobConfFuncOpt: Option[JobConf => Unit],
    inputFormatClass: Class[_ <: InputFormat[K, V]],
    keyClass: Class[K],
    valueClass: Class[V],
    minPartitions: Int)
  // Calls the parent class RDD's constructor
  extends RDD[(K, V)](sc, Nil) with Logging

Step into RDD:

abstract class RDD[T: ClassTag](
    @transient private var _sc: SparkContext,
    // The primary constructor is called; deps is Nil, whose type is List[Nothing]
    @transient private var deps: Seq[Dependency[_]]
  ) extends Serializable with Logging {

   def this(@transient oneParent: RDD[_]) =
    this(oneParent.context, List(new OneToOneDependency(oneParent)))

   // getDependencies is the key to building the DAG and stages
   protected def getDependencies: Seq[Dependency[_]] = deps

   private var dependencies_ : Seq[Dependency[_]] = _
   final def dependencies: Seq[Dependency[_]] = {
    // If this RDD has been checkpointed, depend on the checkpoint RDD instead
    checkpointRDD.map(r => List(new OneToOneDependency(r))).getOrElse {
      // For now, read this simply as: if dependencies_ is null, compute and cache it
      if (dependencies_ == null) {
        dependencies_ = getDependencies
      }
      dependencies_
    }
  }

}

hadoopFile returns: a HadoopRDD with deps = Nil

rdd1: MapPartitionsRDD, deps = List(new OneToOneDependency(hadoopRDD))
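
This is easy to confirm at runtime. A minimal sketch, assuming the sc and rdd1 from the WordCount program above:

println(rdd1)                        // MapPartitionsRDD[1] at textFile ...
println(rdd1.dependencies)           // List(org.apache.spark.OneToOneDependency@...)
println(rdd1.dependencies.head.rdd)  // the wrapped HadoopRDD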


2. MapPartitionsRDD

def map[U: ClassTag](f: T => U): RDD[U] = withScope {
    val cleanF = sc.clean(f)
    // Create a MapPartitionsRDD wrapping this RDD
    new MapPartitionsRDD[U, T](this, (context, pid, iter) => iter.map(cleanF))
  }

The MapPartitionsRDD class:

private[spark] class MapPartitionsRDD[U: ClassTag, T: ClassTag](
    var prev: RDD[T],
    f: (TaskContext, Int, Iterator[T]) => Iterator[U],  // (TaskContext, partition index, iterator)
    preservesPartitioning: Boolean = false,
    isFromBarrier: Boolean = false,
    isOrderSensitive: Boolean = false)
  // Calls the parent class constructor
  extends RDD[U](prev)

Back in RDD:

// The auxiliary constructor is called, building a OneToOneDependency on the single parent
def this(@transient oneParent: RDD[_]) =
    this(oneParent.context, List(new OneToOneDependency(oneParent)))

The Dependency hierarchy:

class OneToOneDependency[T](rdd: RDD[T]) extends NarrowDependency[T](rdd) {
  override def getParents(partitionId: Int): List[Int] = List(partitionId)
}

abstract class NarrowDependency[T](_rdd: RDD[T]) extends Dependency[T] {
  def getParents(partitionId: Int): Seq[Int]
  // The parent RDD
  override def rdd: RDD[T] = _rdd
}

abstract class Dependency[T] extends Serializable {
  def rdd: RDD[T]
}

rdd2: MapPartitionsRDD, deps = List(new OneToOneDependency(rdd1))
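
In practice a OneToOneDependency means: child partition i depends on exactly parent partition i. A small standalone sketch (rddA and dep are made-up names, not from the post; sc is assumed from above):

import org.apache.spark.OneToOneDependency

val rddA = sc.parallelize(1 to 100, 4)   // 4 partitions
val dep  = new OneToOneDependency(rddA)
println(dep.getParents(0))               // List(0)
println(dep.getParents(3))               // List(3)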


3. ShuffledRDD

def reduceByKey(func: (V, V) => V): RDD[(K, V)] = self.withScope {
    reduceByKey(defaultPartitioner(self), func)
  }

def reduceByKey(partitioner: Partitioner, func: (V, V) => V): RDD[(K, V)] = self.withScope {
    combineByKeyWithClassTag[V]((v: V) => v, func, func, partitioner)
  }
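
The no-argument reduceByKey picks its partitioner via defaultPartitioner(self). As a rough check (reusing rdd2 and rdd3 from above; the printed hashes will differ), when no upstream RDD already carries a suitable partitioner this falls back to a HashPartitioner:

import org.apache.spark.Partitioner

println(Partitioner.defaultPartitioner(rdd2))  // org.apache.spark.HashPartitioner@...
println(rdd3.partitioner)                      // Some(org.apache.spark.HashPartitioner@...)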

Then into combineByKeyWithClassTag:

def combineByKeyWithClassTag[C](
      createCombiner: V => C,
      mergeValue: (C, V) => C,
      mergeCombiners: (C, C) => C,
      partitioner: Partitioner,
      mapSideCombine: Boolean = true,
      serializer: Serializer = null)(implicit ct: ClassTag[C]): RDD[(K, C)] = self.withScope {
    require(mergeCombiners != null, "mergeCombiners must be defined") // required as of Spark 0.9.0
    if (keyClass.isArray) {
      if (mapSideCombine) {
        throw new SparkException("Cannot use map-side combining with array keys.")
      }
      if (partitioner.isInstanceOf[HashPartitioner]) {
        throw new SparkException("HashPartitioner cannot partition array keys.")
      }
    }
    val aggregator = new Aggregator[K, V, C](
      self.context.clean(createCombiner),
      self.context.clean(mergeValue),
      self.context.clean(mergeCombiners))
    if (self.partitioner == Some(partitioner)) {
      self.mapPartitions(iter => {
        val context = TaskContext.get()
        new InterruptibleIterator(context, aggregator.combineValuesByKey(iter, context))
      }, preservesPartitioning = true)
    } else {
      // Create the ShuffledRDD
      new ShuffledRDD[K, V, C](self, partitioner)
        .setSerializer(serializer)
        .setAggregator(aggregator)
        .setMapSideCombine(mapSideCombine)
    }
  }
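
The if-branch above is worth pausing on: when the upstream RDD is already partitioned by the same partitioner, no ShuffledRDD (and therefore no ShuffleDependency) is created at all; the aggregation runs inside a mapPartitions. A minimal sketch of that case, with made-up variable names and the sc from above:

import org.apache.spark.HashPartitioner

val pairs = sc.parallelize(Seq(("a", 1), ("b", 1), ("a", 1)))
val prePartitioned = pairs.partitionBy(new HashPartitioner(2))          // one shuffle here
val reduced = prePartitioned.reduceByKey(new HashPartitioner(2), _ + _) // same partitioner
println(reduced.dependencies.head)  // OneToOneDependency, not ShuffleDependency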

Creating the ShuffledRDD:

class ShuffledRDD[K: ClassTag, V: ClassTag, C: ClassTag](
    @transient var prev: RDD[_ <: Product2[K, V]],
    part: Partitioner)
  extends RDD[(K, C)](prev.context, Nil) {

  // Unlike the base RDD, getDependencies is overridden here
  override def getDependencies: Seq[Dependency[_]] = {
    val serializer = userSpecifiedSerializer.getOrElse {
      val serializerManager = SparkEnv.get.serializerManager
      if (mapSideCombine) {
        serializerManager.getSerializer(implicitly[ClassTag[K]], implicitly[ClassTag[C]])
      } else {
        serializerManager.getSerializer(implicitly[ClassTag[K]], implicitly[ClassTag[V]])
      }
    }
    List(new ShuffleDependency(prev, part, serializer, keyOrdering, aggregator, mapSideCombine))
  }

And ShuffleDependency:

class ShuffleDependency[K: ClassTag, V: ClassTag, C: ClassTag](
    @transient private val _rdd: RDD[_ <: Product2[K, V]],
    val partitioner: Partitioner,
    val serializer: Serializer = SparkEnv.get.serializer,
    val keyOrdering: Option[Ordering[K]] = None,
    val aggregator: Option[Aggregator[K, V, C]] = None,
    val mapSideCombine: Boolean = false,
    val shuffleWriterProcessor: ShuffleWriteProcessor = new ShuffleWriteProcessor)
  extends Dependency[Product2[K, V]]{
  // The parent RDD
  override def rdd: RDD[Product2[K, V]] = _rdd.asInstanceOf[RDD[Product2[K, V]]]
}

rdd3: ShuffledRDD, deps = List(new ShuffleDependency(rdd2))
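
Again this can be checked at runtime, continuing with rdd2 and rdd3 from the WordCount program:

println(rdd3)                               // ShuffledRDD[3] at reduceByKey ...
println(rdd3.dependencies.head)             // org.apache.spark.ShuffleDependency@...
println(rdd3.dependencies.head.rdd == rdd2) // true: the dependency points back at rdd2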


4. The DAG

hadoopFile returns: a HadoopRDD with deps = Nil

rdd1: MapPartitionsRDD, deps = List(new OneToOneDependency(hadoopRDD))

rdd2: MapPartitionsRDD, deps = List(new OneToOneDependency(rdd1))

rdd3: ShuffledRDD, deps = List(new ShuffleDependency(rdd2))

Chaining them together, each RDD points back to its parent through its dependencies:

rdd3.deps.rdd -> rdd2

rdd2.deps.rdd -> rdd1

rdd1.deps.rdd -> hadoopRDD

hadoopRDD.deps -> Nil
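
This chain of dependencies is the DAG. When an action runs, the DAGScheduler walks it backwards from the final RDD and cuts stages wherever it meets a ShuffleDependency. The sketch below imitates that backward walk; printLineage is a hypothetical helper, not Spark source, and rdd3 is the one from the WordCount program:

import org.apache.spark.ShuffleDependency
import org.apache.spark.rdd.RDD

// Recursively follow each RDD's dependencies back to the RDDs whose deps are Nil
def printLineage(rdd: RDD[_], indent: String = ""): Unit = {
  println(s"$indent${rdd.getClass.getSimpleName}")
  rdd.dependencies.foreach { dep =>
    val kind = dep match {
      case _: ShuffleDependency[_, _, _] => "ShuffleDependency (stage boundary)"
      case other                         => other.getClass.getSimpleName + " (narrow)"
    }
    println(s"$indent  dep: $kind")
    printLineage(dep.rdd, indent + "    ")
  }
}

printLineage(rdd3)
// ShuffledRDD
//   dep: ShuffleDependency (stage boundary)
//     MapPartitionsRDD                      <- rdd2
//       dep: OneToOneDependency (narrow)
//         MapPartitionsRDD                  <- rdd1
//           dep: OneToOneDependency (narrow)
//             HadoopRDD                     <- deps = Nil, the walk stops here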
