Start from the simplest WordCount:
import org.apache.spark.{SparkConf, SparkContext}

object WordCount {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("WordCount").setMaster("local")
    val sc = new SparkContext(conf)
    val rdd1 = sc.textFile("C:\\Users\\macbook air\\Desktop\\test3.txt")
    // note: map((_, 1)) pairs each *line* with 1; a real word count would flatMap on a
    // delimiter first — the simpler form keeps the lineage short for this walkthrough
    val rdd2 = rdd1.map((_, 1))
    val rdd3 = rdd2.reduceByKey(_ + _)
    println(rdd3.collect().toList)
    sc.stop()
  }
}
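As a quick sanity check before stepping into the source, printing the concrete class of each RDD shows the three subclasses that the rest of this walkthrough traces (a minimal sketch, assuming the program above has run up to rdd3):

println(rdd1.getClass.getSimpleName)   // MapPartitionsRDD
println(rdd2.getClass.getSimpleName)   // MapPartitionsRDD
println(rdd3.getClass.getSimpleName)   // ShuffledRDD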
1. textFile creates the initial HadoopRDD:
def textFile(
    path: String,
    minPartitions: Int = defaultMinPartitions): RDD[String] = withScope {
  assertNotStopped()
  // if you know MapReduce, TextInputFormat and LongWritable should look familiar
  hadoopFile(path, classOf[TextInputFormat], classOf[LongWritable], classOf[Text],
    // the trailing .map turns the result into a MapPartitionsRDD
    minPartitions).map(pair => pair._2.toString).setName(path)
}
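So the RDD that textFile hands back is already the MapPartitionsRDD produced by the trailing .map, and the HadoopRDD sits one dependency below it. A quick check (a sketch, reusing rdd1 from the WordCount program):

println(rdd1)                          // ...MapPartitionsRDD[1] at textFile...
println(rdd1.dependencies.head.rdd)    // ...HadoopRDD[0]... — the RDD built by hadoopFile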
Next, step into hadoopFile:
def hadoopFile[K, V](
    path: String,
    inputFormatClass: Class[_ <: InputFormat[K, V]],
    keyClass: Class[K],
    valueClass: Class[V],
    minPartitions: Int = defaultMinPartitions): RDD[(K, V)] = withScope {
  assertNotStopped()
  // force loading of hdfs-site.xml
  FileSystem.getLocal(hadoopConfiguration)
  // a Hadoop configuration can be around 10 KiB, which is fairly large, so broadcast it
  val confBroadcast = broadcast(new SerializableConfiguration(hadoopConfiguration))
  val setInputPathsFunc = (jobConf: JobConf) => FileInputFormat.setInputPaths(jobConf, path)
  new HadoopRDD(
    this,
    confBroadcast,
    Some(setInputPathsFunc),
    inputFormatClass,
    keyClass,
    valueClass,
    minPartitions).setName(path)
}
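For reference, the same HadoopRDD can be built by calling hadoopFile directly with the arguments textFile passes; the trailing .map inside textFile is what throws away the LongWritable byte offsets (a small sketch, assuming the same sc and input file as the WordCount program):

import org.apache.hadoop.io.{LongWritable, Text}
import org.apache.hadoop.mapred.TextInputFormat

val pairs = sc.hadoopFile("C:\\Users\\macbook air\\Desktop\\test3.txt",
  classOf[TextInputFormat], classOf[LongWritable], classOf[Text])
val lines = pairs.map(_._2.toString)   // what textFile's trailing .map does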
Since the focus here is the DAG, the rest can wait for now. HadoopRDD:
class HadoopRDD[K, V](
    sc: SparkContext,
    broadcastedConf: Broadcast[SerializableConfiguration],
    initLocalJobConfFuncOpt: Option[JobConf => Unit],
    inputFormatClass: Class[_ <: InputFormat[K, V]],
    keyClass: Class[K],
    valueClass: Class[V],
    minPartitions: Int)
  // calls the parent RDD constructor with deps = Nil
  extends RDD[(K, V)](sc, Nil) with Logging
Step into RDD:
abstract class RDD[T: ClassTag](
    @transient private var _sc: SparkContext,
    // HadoopRDD invokes this primary constructor with deps = Nil (statically a List[Nothing])
    @transient private var deps: Seq[Dependency[_]]
  ) extends Serializable with Logging {

  def this(@transient oneParent: RDD[_]) =
    this(oneParent.context, List(new OneToOneDependency(oneParent)))

  // getDependencies is the key to building the DAG and cutting stages
  protected def getDependencies: Seq[Dependency[_]] = deps

  private var dependencies_ : Seq[Dependency[_]] = _

  final def dependencies: Seq[Dependency[_]] = {
    // a checkpointed RDD's only dependency is the checkpoint RDD
    checkpointRDD.map(r => List(new OneToOneDependency(r))).getOrElse {
      // for now, read this simply as: if dependencies_ is null, compute it once and cache it
      if (dependencies_ == null) {
        dependencies_ = getDependencies
      }
      dependencies_
    }
  }
}
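Two details from this snippet are easy to confirm at runtime: the HadoopRDD, constructed with deps = Nil, reports no dependencies at all, and dependencies is computed once via getDependencies and then cached in dependencies_ (a sketch, reusing the RDDs from the WordCount program):

val hadoopRdd = rdd1.dependencies.head.rdd        // the HadoopRDD underneath rdd1
println(hadoopRdd.dependencies)                   // List() — its deps was Nil
println(rdd1.dependencies eq rdd1.dependencies)   // true — cached after the first call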
hadoopFile returns: hadoopRDD, deps = Nil
rdd1: MapPartitionsRDD, deps = List(new OneToOneDependency(hadoopRDD))
2. MapPartitionsRDD:
def map[U: ClassTag](f: T => U): RDD[U] = withScope {
  val cleanF = sc.clean(f)
  // create a MapPartitionsRDD with `this` as the parent
  new MapPartitionsRDD[U, T](this, (context, pid, iter) => iter.map(cleanF))
}
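The function handed to MapPartitionsRDD simply applies cleanF lazily over each partition's iterator, so map(f) behaves like a per-partition iterator transform (a sketch of the equivalence, not the actual call path):

val viaMap = rdd1.map((_, 1))
val viaPartitions = rdd1.mapPartitions(iter => iter.map(word => (word, 1)))
// both produce a MapPartitionsRDD with a single OneToOneDependency on rdd1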
Its constructor:
private[spark] class MapPartitionsRDD[U: ClassTag, T: ClassTag](
    var prev: RDD[T],
    f: (TaskContext, Int, Iterator[T]) => Iterator[U],  // (TaskContext, partition index, iterator)
    preservesPartitioning: Boolean = false,
    isFromBarrier: Boolean = false,
    isOrderSensitive: Boolean = false)
  // calls the parent class constructor
  extends RDD[U](prev)
RDD:
// the auxiliary constructor wraps the single parent in a OneToOneDependency
def this(@transient oneParent: RDD[_]) =
  this(oneParent.context, List(new OneToOneDependency(oneParent)))
Dependency:
class OneToOneDependency[T](rdd: RDD[T]) extends NarrowDependency[T](rdd) {
  override def getParents(partitionId: Int): List[Int] = List(partitionId)
}

abstract class NarrowDependency[T](_rdd: RDD[T]) extends Dependency[T] {
  def getParents(partitionId: Int): Seq[Int]
  // the parent RDD
  override def rdd: RDD[T] = _rdd
}

abstract class Dependency[T] extends Serializable {
  def rdd: RDD[T]
}
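getParents encodes the narrow, one-to-one wiring: partition i of the child depends only on partition i of the parent. A quick illustration (a sketch; OneToOneDependency is public under org.apache.spark):

import org.apache.spark.OneToOneDependency

val dep = new OneToOneDependency(rdd1)
println(dep.getParents(0))   // List(0)
println(dep.getParents(3))   // List(3)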
rdd2: MapPartitionsRDD, deps = List(new OneToOneDependency(rdd1))
3. ShuffledRDD:
def reduceByKey(func: (V, V) => V): RDD[(K, V)] = self.withScope {
  reduceByKey(defaultPartitioner(self), func)
}

def reduceByKey(partitioner: Partitioner, func: (V, V) => V): RDD[(K, V)] = self.withScope {
  combineByKeyWithClassTag[V]((v: V) => v, func, func, partitioner)
}
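defaultPartitioner(self) reuses a partitioner already set on a parent RDD when one exists, and otherwise falls back to a HashPartitioner; passing a partitioner explicitly goes through the same second overload (a sketch, reusing rdd2):

import org.apache.spark.HashPartitioner

val rdd3Explicit = rdd2.reduceByKey(new HashPartitioner(rdd2.getNumPartitions), _ + _)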
combineByKeyWithClassTag:
def combineByKeyWithClassTag[C](
    createCombiner: V => C,
    mergeValue: (C, V) => C,
    mergeCombiners: (C, C) => C,
    partitioner: Partitioner,
    mapSideCombine: Boolean = true,
    serializer: Serializer = null)(implicit ct: ClassTag[C]): RDD[(K, C)] = self.withScope {
  require(mergeCombiners != null, "mergeCombiners must be defined") // required as of Spark 0.9.0
  if (keyClass.isArray) {
    if (mapSideCombine) {
      throw new SparkException("Cannot use map-side combining with array keys.")
    }
    if (partitioner.isInstanceOf[HashPartitioner]) {
      throw new SparkException("HashPartitioner cannot partition array keys.")
    }
  }
  val aggregator = new Aggregator[K, V, C](
    self.context.clean(createCombiner),
    self.context.clean(mergeValue),
    self.context.clean(mergeCombiners))
  if (self.partitioner == Some(partitioner)) {
    // already partitioned the same way: aggregate in place, no shuffle needed
    self.mapPartitions(iter => {
      val context = TaskContext.get()
      new InterruptibleIterator(context, aggregator.combineValuesByKey(iter, context))
    }, preservesPartitioning = true)
  } else {
    // otherwise create a ShuffledRDD
    new ShuffledRDD[K, V, C](self, partitioner)
      .setSerializer(serializer)
      .setAggregator(aggregator)
      .setMapSideCombine(mapSideCombine)
  }
}
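The if/else at the end is the branch that matters for the DAG: when the parent is already partitioned with the same partitioner, the aggregation runs inside mapPartitions and no ShuffledRDD (and hence no new stage) is created. A small experiment (a sketch, reusing rdd2 and rdd3):

import org.apache.spark.HashPartitioner

val prePartitioned = rdd2.partitionBy(new HashPartitioner(4))
val noShuffle = prePartitioned.reduceByKey(new HashPartitioner(4), _ + _)
println(noShuffle.getClass.getSimpleName)   // MapPartitionsRDD — no additional shuffle
println(rdd3.getClass.getSimpleName)        // ShuffledRDD — rdd2 had no partitioner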
Creating the ShuffledRDD:
class ShuffledRDD[K: ClassTag, V: ClassTag, C: ClassTag](
    @transient var prev: RDD[_ <: Product2[K, V]],
    part: Partitioner)
  extends RDD[(K, C)](prev.context, Nil) {

  // unlike the default in RDD, getDependencies is overridden here
  override def getDependencies: Seq[Dependency[_]] = {
    val serializer = userSpecifiedSerializer.getOrElse {
      val serializerManager = SparkEnv.get.serializerManager
      if (mapSideCombine) {
        serializerManager.getSerializer(implicitly[ClassTag[K]], implicitly[ClassTag[C]])
      } else {
        serializerManager.getSerializer(implicitly[ClassTag[K]], implicitly[ClassTag[V]])
      }
    }
    List(new ShuffleDependency(prev, part, serializer, keyOrdering, aggregator, mapSideCombine))
  }
ShuffleDependency:
class ShuffleDependency[K: ClassTag, V: ClassTag, C: ClassTag](
    @transient private val _rdd: RDD[_ <: Product2[K, V]],
    val partitioner: Partitioner,
    val serializer: Serializer = SparkEnv.get.serializer,
    val keyOrdering: Option[Ordering[K]] = None,
    val aggregator: Option[Aggregator[K, V, C]] = None,
    val mapSideCombine: Boolean = false,
    val shuffleWriterProcessor: ShuffleWriteProcessor = new ShuffleWriteProcessor)
  extends Dependency[Product2[K, V]] {
  // the parent RDD
  override def rdd: RDD[Product2[K, V]] = _rdd.asInstanceOf[RDD[Product2[K, V]]]
}
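So the only dependency of rdd3 is a ShuffleDependency pointing back at rdd2, which is exactly the edge where the DAGScheduler later cuts a stage boundary. A quick check (a sketch):

import org.apache.spark.ShuffleDependency

println(rdd3.dependencies.head.isInstanceOf[ShuffleDependency[_, _, _]])   // true
println(rdd3.dependencies.head.rdd eq rdd2)                                // true — the parent is rdd2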
rdd3: ShuffledRDD, deps = List(new ShuffleDependency(rdd2))
4. The DAG:
hadoopFile returns: hadoopRDD, deps = Nil
rdd1: MapPartitionsRDD, deps = List(new OneToOneDependency(hadoopRDD))
rdd2: MapPartitionsRDD, deps = List(new OneToOneDependency(rdd1))
rdd3: ShuffledRDD, deps = List(new ShuffleDependency(rdd2))
Chaining them together (following each dependency's rdd field):
rdd3.deps.head.rdd -> rdd2
rdd2.deps.head.rdd -> rdd1
rdd1.deps.head.rdd -> hadoopRDD
hadoopRDD.deps -> Nil
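The same chain can be read off at runtime without touching any internals: toDebugString prints the lineage (with the shuffle indented as a separate stage), and following dependencies.head.rdd by hand walks from rdd3 down to the HadoopRDD (a sketch):

import org.apache.spark.rdd.RDD

println(rdd3.toDebugString)

var cur: RDD[_] = rdd3
while (cur.dependencies.nonEmpty) {
  println(cur)
  cur = cur.dependencies.head.rdd
}
println(cur)   // the HadoopRDD, whose deps is Nil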