Overview:
There are plenty of writeups of Spark's execution flow online, so I won't repeat them here.
This article walks through the source code using the Spark WordCount example:
val input = sc.textFile("C:\\Users\\pc\\PycharmProjects\\PyNews\\word*", 1)
  .flatMap(_.split(" ")).map(x => (x, 1)).reduceByKey(_ + _).saveAsTextFile("D:/niha")
OK, let's look at the first step, textFile:
/**
 * Read a text file from HDFS, a local file system (available on all nodes), or any
 * Hadoop-supported file system URI, and return it as an RDD of Strings.
 */
// Works with any file system Hadoop can read.
def textFile(
    path: String,
    minPartitions: Int = defaultMinPartitions): RDD[String] = withScope {
  assertNotStopped()
  // Delegates to hadoopFile.
  // Note: TextInputFormat, LongWritable and Text are Hadoop types.
  hadoopFile(path, classOf[TextInputFormat], classOf[LongWritable], classOf[Text],
    minPartitions).map(pair => pair._2.toString).setName(path)
}
/** Get an RDD for a Hadoop file with an arbitrary InputFormat
*
* '''Note:''' Because Hadoop's RecordReader class re-uses the same Writable object for each
* record, directly caching the returned RDD or directly passing it to an aggregation or shuffle
* operation will create many references to the same object.
* If you plan to directly cache, sort, or aggregate Hadoop writable objects, you should first
* copy them using a `map` function.
*/
// In other words: any InputFormat works here, so we can write our own InputFormat (or plug
// in a different one) to read whatever data we want.
//
// Note: because the RecordReader re-uses the same Writable object for every record, directly
// caching the returned RDD, or feeding it straight into an aggregation or shuffle, creates
// many references to one and the same object. If you want to do that, copy the records with
// a map first.
def hadoopFile[K, V](
    path: String,
    inputFormatClass: Class[_ <: InputFormat[K, V]],
    keyClass: Class[K],
    valueClass: Class[V],
    minPartitions: Int = defaultMinPartitions): RDD[(K, V)] = withScope {
  assertNotStopped()
  // A Hadoop configuration can be about 10 KB, which is pretty big, so broadcast it.
  val confBroadcast = broadcast(new SerializableConfiguration(hadoopConfiguration))
  // A closure that will later set the input paths on the JobConf.
  val setInputPathsFunc = (jobConf: JobConf) => FileInputFormat.setInputPaths(jobConf, path)
  // Build the HadoopRDD.
  new HadoopRDD(
    this,
    confBroadcast,
    Some(setInputPathsFunc),
    inputFormatClass,
    keyClass,
    valueClass,
    minPartitions).setName(path)
}
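The re-use warning in the doc comment above matters if you call hadoopFile yourself and want to cache or aggregate the records. A minimal sketch of the copy-before-cache pattern (the path is a placeholder and sc is assumed to be a live SparkContext):
import org.apache.hadoop.io.{LongWritable, Text}
import org.apache.hadoop.mapred.TextInputFormat

// Hadoop's RecordReader re-uses the same Writable instances for every record, so
// materialize plain Scala values with a map before caching; otherwise every cached
// record would end up pointing at the same mutated object.
val raw = sc.hadoopFile("path/to/words",   // placeholder path
  classOf[TextInputFormat], classOf[LongWritable], classOf[Text])
val safe = raw.map { case (offset, line) => (offset.get, line.toString) }
safe.cache()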
Having built the HadoopRDD, textFile then calls RDD's map method on it:
/**
 * Return a new RDD by applying a function to all elements of this RDD.
 */
def map[U: ClassTag](f: T => U): RDD[U] = withScope {
  // Clean the closure of non-serializable references so it can be serialized and shipped
  // to the executors without failing.
  val cleanF = sc.clean(f)
  // Invokes the auxiliary constructor of MapPartitionsRDD's parent class, RDD.
  new MapPartitionsRDD[U, T](this, (context, pid, iter) => iter.map(cleanF))
}
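For context, MapPartitionsRDD itself is only a thin wrapper around its parent; a trimmed sketch of the class (roughly the Spark 1.x source, simplified and with imports omitted):
private[spark] class MapPartitionsRDD[U: ClassTag, T: ClassTag](
    prev: RDD[T],
    f: (TaskContext, Int, Iterator[T]) => Iterator[U],  // (TaskContext, partition index, parent iterator)
    preservesPartitioning: Boolean = false)
  extends RDD[U](prev) {  // invokes the auxiliary constructor shown next

  override val partitioner =
    if (preservesPartitioning) firstParent[T].partitioner else None

  override def getPartitions: Array[Partition] = firstParent[T].partitions

  override def compute(split: Partition, context: TaskContext): Iterator[U] =
    f(context, split.index, firstParent[T].iterator(split, context))
}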
/** Construct an RDD with just a one-to-one dependency on one parent */
def this(@transient oneParent: RDD[_]) =
  // Wrap the parent RDD in a OneToOneDependency.
  this(oneParent.context, List(new OneToOneDependency(oneParent)))

// OneToOneDependency extends NarrowDependency (a narrow dependency).
class OneToOneDependency[T](rdd: RDD[T]) extends NarrowDependency[T](rdd) {
  override def getParents(partitionId: Int): List[Int] = List(partitionId)
}
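For reference, the dependency classes form a small hierarchy; a trimmed sketch (roughly the Spark source):
// The root type: a dependency simply points at a parent RDD.
abstract class Dependency[T] extends Serializable {
  def rdd: RDD[T]
}

// A narrow dependency: each partition of the child RDD depends on a small, fixed set of
// parent partitions, so no shuffle is needed. OneToOneDependency above is one subclass;
// ShuffleDependency (used by reduceByKey later on) is the wide counterpart.
abstract class NarrowDependency[T](_rdd: RDD[T]) extends Dependency[T] {
  // Which parent partitions does child partition `partitionId` depend on?
  def getParents(partitionId: Int): Seq[Int]
  override def rdd: RDD[T] = _rdd
}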
// This lands in RDD's primary constructor.
abstract class RDD[T: ClassTag](
    @transient private var _sc: SparkContext,
    // For our MapPartitionsRDD this is List(new OneToOneDependency(hadoopRDD)).
    @transient private var deps: Seq[Dependency[_]]
  ) extends Serializable with Logging {

  // The most important members of RDD:

  /**
   * :: DeveloperApi ::
   * Implemented by subclasses to compute a given partition.
   */
  @DeveloperApi
  // The RDD's compute function; implemented by subclasses.
  def compute(split: Partition, context: TaskContext): Iterator[T]

  /**
   * Implemented by subclasses to return the set of partitions in this RDD. This method will only
   * be called once, so it is safe to implement a time-consuming computation in it.
   */
  // Returns the set of partitions of this RDD; implemented by subclasses.
  protected def getPartitions: Array[Partition]

  /**
   * Implemented by subclasses to return how this RDD depends on parent RDDs. This method will only
   * be called once, so it is safe to implement a time-consuming computation in it.
   */
  // Returns how this RDD depends on its parent RDDs; the default simply returns deps.
  protected def getDependencies: Seq[Dependency[_]] = deps
}
So at this point: MapPartitionsRDD.deps------>List(new OneToOneDependency(hadoopRDD)), and hadoopRDD.deps----->Nil (the HadoopRDD is constructed with no dependencies).
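You can check this lineage from the shell; a quick sketch, assuming a live SparkContext sc and a placeholder path:
val lines = sc.textFile("path/to/words")           // MapPartitionsRDD built on a HadoopRDD
println(lines)                                     // the MapPartitionsRDD
println(lines.dependencies.head.rdd)               // its parent: the HadoopRDD
println(lines.dependencies.head.rdd.dependencies)  // List() -- the HadoopRDD has no parents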
Next comes flatMap:
/**
 * Return a new RDD by first applying a function to all elements of this
 * RDD, and then flattening the results.
 */
// Same pattern as map.
def flatMap[U: ClassTag](f: T => TraversableOnce[U]): RDD[U] = withScope {
  val cleanF = sc.clean(f)
  new MapPartitionsRDD[U, T](this, (context, pid, iter) => iter.flatMap(cleanF))
}
So the RDD chain is now:
MapPartitionsRDD.deps------>List(new OneToOneDependency(MapPartitionsRDD))
MapPartitionsRDD.deps------>List(new OneToOneDependency(hadoopRDD))
hadoopRDD.deps----->Nil
Next is another map, so the RDD chain becomes:
MapPartitionsRDD.deps------>List(new OneToOneDependency(MapPartitionsRDD))
MapPartitionsRDD.deps------>List(new OneToOneDependency(MapPartitionsRDD))
MapPartitionsRDD.deps------>List(new OneToOneDependency(hadoopRDD))
hadoopRDD.deps----->Nil
Next comes reduceByKey.
First, note that neither MapPartitionsRDD nor RDD defines a reduceByKey method; the call only compiles because of an implicit conversion to PairRDDFunctions.
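The conversion lives in the RDD companion object (in older releases it sat on SparkContext) and looks roughly like this:
// Any RDD whose element type is a pair (K, V) is silently wrapped in PairRDDFunctions,
// which is where reduceByKey, combineByKey, join, etc. are defined.
implicit def rddToPairRDDFunctions[K, V](rdd: RDD[(K, V)])
    (implicit kt: ClassTag[K], vt: ClassTag[V], ord: Ordering[K] = null): PairRDDFunctions[K, V] = {
  new PairRDDFunctions(rdd)
}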
/**
* Merge the values for each key using an associative reduce function. This will also perform
* the merging locally on each mapper before sending results to a reducer, similarly to a
* "combiner" in MapReduce. Output will be hash-partitioned with the existing partitioner/
* parallelism level.
*/
def reduceByKey(func: (V, V) => V): RDD[(K, V)] = self.withScope {
  reduceByKey(defaultPartitioner(self), func)
}
// defaultPartitioner is shown below:
/**
* Choose a partitioner to use for a cogroup-like operation between a number of RDDs.
*
* If any of the RDDs already has a partitioner, choose that one.
*
* Otherwise, we use a default HashPartitioner. For the number of partitions, if
* spark.default.parallelism is set, then we'll use the value from SparkContext
* defaultParallelism, otherwise we'll use the max number of upstream partitions.
*
* Unless spark.default.parallelism is set, the number of partitions will be the
* same as the number of partitions in the largest upstream RDD, as this should
* be least likely to cause out-of-memory errors.
*
* We use two method parameters (rdd, others) to enforce callers passing at least 1 RDD.
*/
// In short: pick a partitioner to use across these RDDs for a cogroup-like operation.
// If any of the RDDs already has a partitioner, use that one.
// Otherwise fall back to a HashPartitioner. Its partition count depends on whether
// spark.default.parallelism is set; if it is not, use the largest partition count found
// among the upstream RDDs.
def defaultPartitioner(rdd: RDD[_], others: RDD[_]*): Partitioner = {
  // partitions returns each RDD's partitions; sort the RDDs by that count, largest first.
  val bySize = (Seq(rdd) ++ others).sortBy(_.partitions.size).reverse
  for (r <- bySize if r.partitioner.isDefined && r.partitioner.get.numPartitions > 0) {
    return r.partitioner.get
  }
  if (rdd.context.conf.contains("spark.default.parallelism")) {
    new HashPartitioner(rdd.context.defaultParallelism)
  } else {
    new HashPartitioner(bySize.head.partitions.size)
  }
}
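A small illustration of these rules (a sketch; it assumes spark.default.parallelism is not set, a live SparkContext sc, and the version of defaultPartitioner shown above):
import org.apache.spark.{HashPartitioner, Partitioner}

val a = sc.parallelize(Seq((1, "a")), numSlices = 4)  // no partitioner, 4 partitions
val b = sc.parallelize(Seq((2, "b")), numSlices = 8)  // no partitioner, 8 partitions

// Neither RDD carries a partitioner, so the fallback HashPartitioner is sized to the
// largest upstream RDD:
println(Partitioner.defaultPartitioner(a, b).numPartitions)  // 8

// If one of the RDDs is already partitioned, its partitioner wins:
val c = a.partitionBy(new HashPartitioner(2))
println(Partitioner.defaultPartitioner(c, b).numPartitions)  // 2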
/**
* Get the array of partitions of this RDD, taking into account whether the
* RDD is checkpointed or not.
*/
final def partitions: Array[Partition] = {
  checkpointRDD.map(_.partitions).getOrElse {
    if (partitions_ == null) {
      // getPartitions computes this RDD's partitions; the result is cached in partitions_.
      partitions_ = getPartitions
    }
    partitions_
  }
}
// MapPartitionsRDD's getPartitions implementation:
// it asks the first parent RDD for its partitions, which recurses up the dependency chain
// until it reaches the HadoopRDD, whose getPartitions is finally invoked.
override def getPartitions: Array[Partition] = firstParent[T].partitions

// firstParent returns the first parent RDD that this RDD depends on.
protected[spark] def firstParent[U: ClassTag]: RDD[U] = {
  dependencies.head.rdd.asInstanceOf[RDD[U]]
}
/**
* Get the list of dependencies of this RDD, taking into account whether the
* RDD is checkpointed or not.
*/
// Get this RDD's dependencies (or a single OneToOneDependency on the checkpoint RDD if this
// RDD has been checkpointed).
final def dependencies: Seq[Dependency[_]] = {
  checkpointRDD.map(r => List(new OneToOneDependency(r))).getOrElse {
    if (dependencies_ == null) {
      // Compute the dependency chain once and cache it.
      dependencies_ = getDependencies
    }
    dependencies_
  }
}
// The default getDependencies simply returns deps.
protected def getDependencies: Seq[Dependency[_]] = deps
// HadoopRDD's getPartitions:
override def getPartitions: Array[Partition] = {
  val jobConf = getJobConf()
  // add the credentials here as this can be called before SparkContext initialized
  SparkHadoopUtil.get.addCredentials(jobConf)
  val inputFormat = getInputFormat(jobConf)
  // The partitions come from inputFormat.getSplits: one HadoopPartition per InputSplit.
  val inputSplits = inputFormat.getSplits(jobConf, minPartitions)
  val array = new Array[Partition](inputSplits.size)
  for (i <- 0 until inputSplits.size) {
    array(i) = new HadoopPartition(id, i, inputSplits(i))
  }
  array
}
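Note that minPartitions is only a lower-bound hint handed to InputFormat.getSplits; the actual partition count also depends on how many files and blocks the input spans. A quick check (sketch, placeholder path):
val rdd = sc.textFile("path/to/words", minPartitions = 2)
// One HadoopPartition per InputSplit, so this may print a number larger than 2.
println(rdd.partitions.length)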
--------------------------
reduceByKey ultimately calls the following overload:
/**
* Merge the values for each key using an associative reduce function. This will also perform
* the merging locally on each mapper before sending results to a reducer, similarly to a
* "combiner" in MapReduce.
*/
def reduceByKey(partitioner: Partitioner, func: (V, V) => V): RDD[(K, V)] = self.withScope {
  combineByKeyWithClassTag[V]((v: V) => v, func, func, partitioner)
}
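So for the WordCount's reduceByKey(_ + _), createCombiner is the identity and both merge functions are _ + _. Conceptually, the map-side combine then does something like the plain-Scala sketch below for each partition (a model of what Aggregator.combineValuesByKey achieves, not Spark's actual code):
// Fold one partition's records into per-key combiners, locally, before any shuffle.
def combineValuesByKey(records: Iterator[(String, Int)]): Iterator[(String, Int)] = {
  val combiners = scala.collection.mutable.HashMap.empty[String, Int]
  for ((word, count) <- records) {
    combiners(word) = combiners.get(word) match {
      case None           => count            // createCombiner: (v: V) => v
      case Some(combined) => combined + count // mergeValue: _ + _
    }
  }
  combiners.iterator
}

// combineValuesByKey(Iterator(("a", 1), ("b", 1), ("a", 1))).toMap == Map("a" -> 2, "b" -> 1)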
/**
* :: Experimental ::
* Generic function to combine the elements for each key using a custom set of aggregation
* functions. Turns an RDD[(K, V)] into a result of type RDD[(K, C)], for a "combined type" C
* Note that V and C can be different -- for example, one might group an RDD of type
* (Int, Int) into an RDD of type (Int, Seq[Int]). Users provide three functions:
*
* - `createCombiner`, which turns a V into a C (e.g., creates a one-element list)
* - `mergeValue`, to merge a V into a C (e.g., adds it to the end of a list)
* - `mergeCombiners`, to combine two C's into a single one.
*
* In addition, users can control the partitioning of the output RDD, and whether to perform
* map-side aggregation (if a mapper can produce multiple items with the same key).
*/
// In short: this is the generic (lowest-level) function that combines the elements for each
// key with a user-supplied set of aggregation functions.
// It turns an RDD[(K, V)] into an RDD[(K, C)], where the "combined type" C may differ from V;
// for example, (Int, Int) pairs can be aggregated into (Int, Seq[Int]).
// createCombiner turns a V into a C;
// mergeValue merges a V into a C;
// mergeCombiners merges two C values into one.
@Experimental
def combineByKeyWithClassTag[C](
    createCombiner: V => C,
    mergeValue: (C, V) => C,
    mergeCombiners: (C, C) => C,
    partitioner: Partitioner,
    mapSideCombine: Boolean = true,
    serializer: Serializer = null)(implicit ct: ClassTag[C]): RDD[(K, C)] = self.withScope {
  require(mergeCombiners != null, "mergeCombiners must be defined") // required as of Spark 0.9.0
  if (keyClass.isArray) {
    if (mapSideCombine) {
      throw new SparkException("Cannot use map-side combining with array keys.")
    }
    if (partitioner.isInstanceOf[HashPartitioner]) {
      throw new SparkException("Default partitioner cannot partition array keys.")
    }
  }
  val aggregator = new Aggregator[K, V, C](
    self.context.clean(createCombiner),
    self.context.clean(mergeValue),
    self.context.clean(mergeCombiners))
  if (self.partitioner == Some(partitioner)) {
    self.mapPartitions(iter => {
      val context = TaskContext.get()
      new InterruptibleIterator(context, aggregator.combineValuesByKey(iter, context))
    }, preservesPartitioning = true)
  } else {
    // The upstream MapPartitionsRDD has no partitioner, so WordCount takes this branch:
    // the ShuffledRDD is created here.
    new ShuffledRDD[K, V, C](self, partitioner)
      .setSerializer(serializer)
      .setAggregator(aggregator)
      .setMapSideCombine(mapSideCombine)
  }
}
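To make the three functions concrete, here is the (Int, Int) -> (Int, Seq[Int]) example from the doc comment written as user code (a sketch; combineByKey is the public entry point that delegates to combineByKeyWithClassTag in this version):
val pairs = sc.parallelize(Seq((1, 10), (1, 11), (2, 20)))

val grouped = pairs.combineByKey[List[Int]](
  (v: Int) => List(v),                          // createCombiner: start a one-element list
  (c: List[Int], v: Int) => v :: c,             // mergeValue: add a value to the partition-local list
  (c1: List[Int], c2: List[Int]) => c1 ::: c2)  // mergeCombiners: concatenate lists from different partitions

// grouped.collect() yields something like Array((1, List(11, 10)), (2, List(20)));
// the order inside each list is not guaranteed.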
// The ShuffledRDD implementation. The prev passed to the constructor is the RDD that called
// reduceByKey, i.e. the last MapPartitionsRDD.
class ShuffledRDD[K: ClassTag, V: ClassTag, C: ClassTag](
    @transient var prev: RDD[_ <: Product2[K, V]],
    part: Partitioner)
  extends RDD[(K, C)](prev.context, Nil) {

  private var serializer: Option[Serializer] = None
  private var keyOrdering: Option[Ordering[K]] = None
  private var aggregator: Option[Aggregator[K, V, C]] = None
  private var mapSideCombine: Boolean = false

  /** Set a serializer for this RDD's shuffle, or null to use the default (spark.serializer) */
  def setSerializer(serializer: Serializer): ShuffledRDD[K, V, C] = {
    this.serializer = Option(serializer)
    this
  }

  /** Set key ordering for RDD's shuffle. */
  def setKeyOrdering(keyOrdering: Ordering[K]): ShuffledRDD[K, V, C] = {
    this.keyOrdering = Option(keyOrdering)
    this
  }

  /** Set aggregator for RDD's shuffle. */
  def setAggregator(aggregator: Aggregator[K, V, C]): ShuffledRDD[K, V, C] = {
    this.aggregator = Option(aggregator)
    this
  }

  /** Set mapSideCombine flag for RDD's shuffle. */
  def setMapSideCombine(mapSideCombine: Boolean): ShuffledRDD[K, V, C] = {
    this.mapSideCombine = mapSideCombine
    this
  }
  // The ShuffleDependency is only built when this method is called (note that the constructor
  // above passes Nil as the parent dependencies).
  override def getDependencies: Seq[Dependency[_]] = {
    // prev is the MapPartitionsRDD, so this builds
    // new ShuffleDependency(MapPartitionsRDD, part, serializer, keyOrdering, aggregator, mapSideCombine)
    List(new ShuffleDependency(prev, part, serializer, keyOrdering, aggregator, mapSideCombine))
  }
  override val partitioner = Some(part)

  override def getPartitions: Array[Partition] = {
    Array.tabulate[Partition](part.numPartitions)(i => new ShuffledRDDPartition(i))
  }

  override protected def getPreferredLocations(partition: Partition): Seq[String] = {
    val tracker = SparkEnv.get.mapOutputTracker.asInstanceOf[MapOutputTrackerMaster]
    val dep = dependencies.head.asInstanceOf[ShuffleDependency[K, V, C]]
    tracker.getPreferredLocationsForShuffle(dep, partition.index)
  }

  override def compute(split: Partition, context: TaskContext): Iterator[(K, C)] = {
    val dep = dependencies.head.asInstanceOf[ShuffleDependency[K, V, C]]
    SparkEnv.get.shuffleManager.getReader(dep.shuffleHandle, split.index, split.index + 1, context)
      .read()
      .asInstanceOf[Iterator[(K, C)]]
  }

  override def clearDependencies() {
    super.clearDependencies()
    prev = null
  }
}
OK, the RDD chain is now:
ShuffledRDD.deps------>List(new ShuffleDependency(MapPartitionsRDD))
MapPartitionsRDD.deps------>List(new OneToOneDependency(MapPartitionsRDD))
MapPartitionsRDD.deps------>List(new OneToOneDependency(MapPartitionsRDD))
MapPartitionsRDD.deps------>List(new OneToOneDependency(hadoopRDD))
hadoopRDD.deps----->Nil
This is the DAG everyone talks about.
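Spark can print this lineage for you. A quick sketch (drop the saveAsTextFile so the chain can still be inspected; the path is a placeholder):
val counts = sc.textFile("path/to/words")
  .flatMap(_.split(" "))
  .map(x => (x, 1))
  .reduceByKey(_ + _)

// toDebugString renders the dependency chain: the ShuffledRDD on top, the
// MapPartitionsRDDs beneath it, and the HadoopRDD at the bottom.
println(counts.toDebugString)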