The components initialized by SparkEnv:
org.apache.spark.shuffle.sort.SortShuffleManager                  // shuffleManager
org.apache.spark.MapOutputTrackerMaster@25e45d                    // mapOutputTracker
org.apache.spark.shuffle.ShuffleMemoryManager                     // shuffleMemoryManager
org.apache.spark.serializer.JavaSerializer@dc42ab                 // serializer, closureSerializer
org.apache.spark.storage.BlockManager@16d5aa8                     // blockManager
org.apache.spark.storage.BlockManagerMaster@a62840                // blockManagerMaster
org.apache.spark.network.netty.NettyBlockTransferService@148d5b2  // blockTransferService
org.apache.spark.CacheManager@1ac9928                             // cacheManager
org.apache.spark.HttpFileServer@131d67                            // httpFileServer
org.apache.spark.metrics.MetricsSystem@516ac3                     // metricsSystem
org.apache.spark.broadcast.BroadcastManager@f8008d                // broadcastManager
C:\Users\hadoop\AppData\Local\Temp\spark-7f0f46d9-28d0-4e8d-94d0-9a8f8f589d14  // sparkFilesDir
These instances are then passed to the SparkEnv constructor:

new SparkEnv(
  executorId,
  actorSystem,
  serializer,
  closureSerializer,
  cacheManager,
  mapOutputTracker,
  shuffleManager,
  broadcastManager,
  blockTransferService,
  blockManager,
  securityManager,
  httpFileServer,
  sparkFilesDir,
  metricsSystem,
  shuffleMemoryManager,
  conf)
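Each of these components can be inspected at runtime through SparkEnv.get. A minimal sketch against the Spark 1.x API (InspectSparkEnv is a hypothetical object name; the field names match the constructor above):

import org.apache.spark.{SparkConf, SparkContext, SparkEnv}

object InspectSparkEnv {
  def main(args: Array[String]) {
    val sc = new SparkContext(new SparkConf().setAppName("InspectSparkEnv").setMaster("local"))
    val env = SparkEnv.get            // driver-side SparkEnv built during SparkContext startup
    println(env.shuffleManager)       // SortShuffleManager, the default since Spark 1.2
    println(env.serializer)           // JavaSerializer unless spark.serializer is overridden
    println(env.closureSerializer)
    println(env.blockManager)
    sc.stop()
  }
}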
The example program under analysis:
package spark.examples

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._

object SparkWordCount {
  def main(args: Array[String]) {
    System.setProperty("hadoop.home.dir", "E:\\devsoftware\\hadoop-2.5.2\\hadoop-2.5.2")
    val conf = new SparkConf()
    conf.setAppName("SparkWordCount")
    conf.setMaster("local")
    val sc = new SparkContext(conf)
    val rdd1 = sc.textFile("file:///D:/word.in")
    println(rdd1.toDebugString)
    val rdd2 = rdd1.flatMap(_.split(" "))   // was `rdd.flatMap(...)`: rdd is undefined
    println("rdd2:" + rdd2.toDebugString)
    val rdd3 = rdd2.map((_, 1))
    println("rdd3:" + rdd3.toDebugString)
    val rdd4 = rdd3.reduceByKey(_ + _)      // was `rdd4.reduceByKey(...)`: a self-reference
    println("rdd4:" + rdd4.toDebugString)
    rdd4.saveAsTextFile("file:///D:/wordout" + System.currentTimeMillis())  // save the counts (was rdd3)
    sc.stop()
  }
}
The RDD dependency graphs printed by toDebugString:
RDD1
rdd1:(1) file:///D:/word.in MappedRDD[1] at textFile at SparkWordCount.scala:15 []
| file:///D:/word.in HadoopRDD[0] at textFile at SparkWordCount.scala:15 []
RDD2
rdd2:(1) FlatMappedRDD[2] at flatMap at SparkWordCount.scala:17 []
| file:///D:/word.in MappedRDD[1] at textFile at SparkWordCount.scala:15 []
| file:///D:/word.in HadoopRDD[0] at textFile at SparkWordCount.scala:15 []
RDD3
rdd3:(1) MappedRDD[3] at map at SparkWordCount.scala:19 []
| FlatMappedRDD[2] at flatMap at SparkWordCount.scala:17 []
| file:///D:/word.in MappedRDD[1] at textFile at SparkWordCount.scala:15 []
| file:///D:/word.in HadoopRDD[0] at textFile at SparkWordCount.scala:15 []
RDD4
rdd4:(1) ShuffledRDD[4] at reduceByKey at SparkWordCount.scala:21 []
+-(1) MappedRDD[3] at map at SparkWordCount.scala:19 []
| FlatMappedRDD[2] at flatMap at SparkWordCount.scala:17 []
| file:///D:/word.in MappedRDD[1] at textFile at SparkWordCount.scala:15 []
| file:///D:/word.in HadoopRDD[0] at textFile at SparkWordCount.scala:15 []
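In this output the indented `|` lines are narrow (one-to-one) dependencies that stay inside a single stage, while the `+-(1)` under rdd4 marks the shuffle boundary introduced by reduceByKey. The dependency types can also be checked programmatically; a minimal sketch against the Spark 1.x public API (InspectDeps is a hypothetical name, paths as in the example above):

import org.apache.spark.{OneToOneDependency, ShuffleDependency, SparkConf, SparkContext}
import org.apache.spark.SparkContext._

object InspectDeps {
  def main(args: Array[String]) {
    val sc = new SparkContext(new SparkConf().setAppName("InspectDeps").setMaster("local"))
    val rdd3 = sc.textFile("file:///D:/word.in").flatMap(_.split(" ")).map((_, 1))
    val rdd4 = rdd3.reduceByKey(_ + _)
    rdd3.dependencies.foreach {
      case _: OneToOneDependency[_]      => println("rdd3 -> parent: narrow (one-to-one)")
      case d                             => println("rdd3 -> parent: " + d.getClass.getSimpleName)
    }
    rdd4.dependencies.foreach {
      case _: ShuffleDependency[_, _, _] => println("rdd4 -> parent: wide (shuffle)")
      case d                             => println("rdd4 -> parent: " + d.getClass.getSimpleName)
    }
    sc.stop()
  }
}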
For saveAsTextFile, the func that ResultTask.runTask deserializes and invokes is the writeToFile function from PairRDDFunctions, which performs the actual write:
override def runTask(context: TaskContext): U = {
  // Deserialize the RDD and the func using the broadcast variables.
  val ser = SparkEnv.get.closureSerializer.newInstance()
  val (rdd, func) = ser.deserialize[(RDD[T], (TaskContext, Iterator[T]) => U)](
    ByteBuffer.wrap(taskBinary.value), Thread.currentThread.getContextClassLoader)
  metrics = Some(context.taskMetrics)
  // For saveAsTextFile, func is PairRDDFunctions.writeToFile, which writes out the results
  func(context, rdd.iterator(partition, context))
}
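The `(TaskContext, Iterator[T]) => U` shape of func is the same one the public sc.runJob API accepts, so the pattern ResultTask executes can be reproduced directly; a minimal sketch (RunJobSketch is a hypothetical name):

import org.apache.spark.{SparkConf, SparkContext, TaskContext}

object RunJobSketch {
  def main(args: Array[String]) {
    val sc = new SparkContext(new SparkConf().setAppName("RunJobSketch").setMaster("local"))
    val rdd = sc.parallelize(Seq("a" -> 1, "b" -> 2, "c" -> 3), 2)
    // runJob applies a (TaskContext, Iterator[T]) => U function to every partition,
    // which is exactly the shape of the func that ResultTask.runTask deserializes and calls.
    val counts: Array[Long] = sc.runJob(rdd, (context: TaskContext, iter: Iterator[(String, Int)]) => {
      var n = 0L
      while (iter.hasNext) { iter.next(); n += 1 }
      n
    })
    println("records per partition: " + counts.mkString(", "))
    sc.stop()
  }
}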
writeToFile in PairRDDFunctions does the per-partition write for saveAsTextFile:
val writeToFile = (context: TaskContext, iter: Iterator[(K, V)]) => {
  val config = wrappedConf.value
  // Hadoop wants a 32-bit task attempt ID, so if ours is bigger than Int.MaxValue, roll it
  // around by taking a mod. We expect that no task will be attempted 2 billion times.
  val attemptNumber = (context.attemptId % Int.MaxValue).toInt
  val (outputMetrics, bytesWrittenCallback) = initHadoopOutputMetrics(context, config)

  writer.setup(context.stageId, context.partitionId, attemptNumber)
  writer.open()
  try {
    var recordsWritten = 0L
    while (iter.hasNext) {
      val record = iter.next()
      writer.write(record._1.asInstanceOf[AnyRef], record._2.asInstanceOf[AnyRef])

      // Update bytes written metric every few records
      maybeUpdateOutputMetrics(bytesWrittenCallback, outputMetrics, recordsWritten)
      recordsWritten += 1
    }
  } finally {
    writer.close()
  }
  writer.commit()
  bytesWrittenCallback.foreach { fn => outputMetrics.bytesWritten = fn() }
}
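For completeness, saveAsTextFile on an RDD[String] reaches writeToFile by converting each element into a Hadoop key/value pair and delegating to PairRDDFunctions. Roughly (a paraphrase of the Spark 1.x source, not verbatim):

import org.apache.hadoop.io.{NullWritable, Text}
import org.apache.hadoop.mapred.TextOutputFormat

// Inside RDD[T]: wrap every element as (NullWritable, Text); saveAsHadoopFile then
// submits writeToFile (above) as the per-partition job function.
def saveAsTextFile(path: String) {
  this.map(x => (NullWritable.get(), new Text(x.toString)))
    .saveAsHadoopFile[TextOutputFormat[NullWritable, Text]](path)
}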