[Spark 26] Spark Code Analysis

Components initialized in SparkEnv

SparkEnv.create instantiates the core runtime services listed below and then hands them to the SparkEnv constructor (the snippet after the list). The dump is from a local run; the @xxxxxx suffixes are just object identity hashes:

	org.apache.spark.shuffle.sort.SortShuffleManager   ///shuffleManager
	org.apache.spark.MapOutputTrackerMaster
	org.apache.spark.shuffle.ShuffleMemoryManager
	org.apache.spark.network.netty.NettyBlockTransferService
	org.apache.spark.MapOutputTrackerMaster@25e45d
	org.apache.spark.serializer.JavaSerializer@dc42ab   ///closureSerializer, serializer
	org.apache.spark.storage.BlockManager@16d5aa8
	org.apache.spark.storage.BlockManagerMaster@a62840
	org.apache.spark.network.netty.NettyBlockTransferService@148d5b2   //blockTransferService
	org.apache.spark.CacheManager@1ac9928
	org.apache.spark.HttpFileServer@131d67
	org.apache.spark.metrics.MetricsSystem@516ac3
	org.apache.spark.broadcast.BroadcastManager@f8008d
	C:\Users\hadoop\AppData\Local\Temp\spark-7f0f46d9-28d0-4e8d-94d0-9a8f8f589d14   //sparkFilesDir


    new SparkEnv(
      executorId,
      actorSystem,
      serializer,
      closureSerializer,
      cacheManager,
      mapOutputTracker,
      shuffleManager,
      broadcastManager,
      blockTransferService,
      blockManager,
      securityManager,
      httpFileServer,
      sparkFilesDir,
      metricsSystem,
      shuffleMemoryManager,
      conf)
  }
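
The fields above are exactly what driver and task code later retrieves through SparkEnv.get. A minimal sketch of how to inspect them (my own illustration against the Spark 1.x API, not part of the original post; the object name SparkEnvPeek is a placeholder):

import org.apache.spark.{SparkConf, SparkContext, SparkEnv}

object SparkEnvPeek {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("SparkEnvPeek").setMaster("local"))
    val env = SparkEnv.get                // the driver-side SparkEnv of this JVM
    println(env.serializer)               // e.g. org.apache.spark.serializer.JavaSerializer
    println(env.closureSerializer)        // serializer used for task closures
    println(env.shuffleManager)           // e.g. org.apache.spark.shuffle.sort.SortShuffleManager
    println(env.blockManager)             // block storage service for this node
    println(env.mapOutputTracker)         // MapOutputTrackerMaster on the driver
    sc.stop()
  }
}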

 

The source code analyzed in this post:

 

package spark.examples

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext

import org.apache.spark.SparkContext._

object SparkWordCount {
  def main(args: Array[String]) {
    System.setProperty("hadoop.home.dir", "E:\\devsoftware\\hadoop-2.5.2\\hadoop-2.5.2");
    val conf = new SparkConf()
    conf.setAppName("SparkWordCount")
    conf.setMaster("local")
    val sc = new SparkContext(conf)
    val rdd1 = sc.textFile("file:///D:/word.in")
    println(rdd1.toDebugString)
    val rdd2 = rdd1.flatMap(_.split(" "))
    println("rdd2:" + rdd2.toDebugString)
    val rdd3 = rdd2.map((_, 1))
    println("rdd3:" + rdd3.toDebugString)
    val rdd4 = rdd3.reduceByKey(_ + _)
    println("rdd4:" + rdd4.toDebugString)
    rdd4.saveAsTextFile("file:///D:/wordout" + System.currentTimeMillis())
    sc.stop
  }
}

 

The RDD dependency graphs printed by toDebugString are:

RDD1

rdd1:(1) file:///D:/word.in MappedRDD[1] at textFile at SparkWordCount.scala:15 []
 |  file:///D:/word.in HadoopRDD[0] at textFile at SparkWordCount.scala:15 []

RDD2

rdd2:(1) FlatMappedRDD[2] at flatMap at SparkWordCount.scala:17 []
 |  file:///D:/word.in MappedRDD[1] at textFile at SparkWordCount.scala:15 []
 |  file:///D:/word.in HadoopRDD[0] at textFile at SparkWordCount.scala:15 []

RDD3

rdd3:(1) MappedRDD[3] at map at SparkWordCount.scala:19 []
 |  FlatMappedRDD[2] at flatMap at SparkWordCount.scala:17 []
 |  file:///D:/word.in MappedRDD[1] at textFile at SparkWordCount.scala:15 []
 |  file:///D:/word.in HadoopRDD[0] at textFile at SparkWordCount.scala:15 []

 

RDD4

rdd4:(1) ShuffledRDD[4] at reduceByKey at SparkWordCount.scala:21 []
 +-(1) MappedRDD[3] at map at SparkWordCount.scala:19 []
    |  FlatMappedRDD[2] at flatMap at SparkWordCount.scala:17 []
    |  file:///D:/word.in MappedRDD[1] at textFile at SparkWordCount.scala:15 []
    |  file:///D:/word.in HadoopRDD[0] at textFile at SparkWordCount.scala:15 []
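
The "+-(1)" indentation switch at rdd4 marks a stage boundary: reduceByKey introduces a ShuffleDependency, while textFile/flatMap/map only add narrow OneToOneDependencies. This can be checked directly through the dependencies field; a small sketch that assumes the rdd3 and rdd4 values from the WordCount program above (insert it before sc.stop):

import org.apache.spark.{OneToOneDependency, ShuffleDependency}

rdd3.dependencies.foreach {
  case d: OneToOneDependency[_]      => println("rdd3 depends on " + d.rdd + " (narrow, one-to-one)")
  case d                             => println("rdd3 depends on " + d.rdd)
}
rdd4.dependencies.foreach {
  case d: ShuffleDependency[_, _, _] => println("rdd4 has a shuffle dependency, shuffleId = " + d.shuffleId)
  case d                             => println("rdd4 depends on " + d.rdd)
}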
 

For saveAsTextFile, the func that ResultTask.runTask deserializes and invokes is the writeToFile closure defined in PairRDDFunctions, which performs the actual write of the results:

 

  override def runTask(context: TaskContext): U = {
    // Deserialize the RDD and the func using the broadcast variables.
    val ser = SparkEnv.get.closureSerializer.newInstance()
    val (rdd, func) = ser.deserialize[(RDD[T], (TaskContext, Iterator[T]) => U)](
      ByteBuffer.wrap(taskBinary.value), Thread.currentThread.getContextClassLoader)

    metrics = Some(context.taskMetrics)
    func(context, rdd.iterator(partition, context)) // for saveAsTextFile this func is the writeToFile closure from PairRDDFunctions, which writes the results out
  }

 

 

The writeToFile function in PairRDDFunctions that performs the write (saveAsTextFile):

 val writeToFile = (context: TaskContext, iter: Iterator[(K, V)]) => {
      val config = wrappedConf.value
      // Hadoop wants a 32-bit task attempt ID, so if ours is bigger than Int.MaxValue, roll it
      // around by taking a mod. We expect that no task will be attempted 2 billion times.
      val attemptNumber = (context.attemptId % Int.MaxValue).toInt

      val (outputMetrics, bytesWrittenCallback) = initHadoopOutputMetrics(context, config)

      writer.setup(context.stageId, context.partitionId, attemptNumber)
      writer.open()
      try {
        var recordsWritten = 0L
        while (iter.hasNext) {
          val record = iter.next()
          writer.write(record._1.asInstanceOf[AnyRef], record._2.asInstanceOf[AnyRef])

          // Update bytes written metric every few records
          maybeUpdateOutputMetrics(bytesWrittenCallback, outputMetrics, recordsWritten)
          recordsWritten += 1
        }
      } finally {
        writer.close()
      }
      writer.commit()
      bytesWrittenCallback.foreach { fn => outputMetrics.bytesWritten = fn() }
    }
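
To tie the two snippets together: in the Spark 1.x source (the exact call chain varies slightly across versions), saveAsTextFile maps every element to a (NullWritable, Text) pair and calls saveAsHadoopFile, which ends in saveAsHadoopDataset; that method defines the writeToFile closure above and submits it via self.context.runJob(self, writeToFile), so each ResultTask runs writeToFile over one partition and the driver finally calls writer.commitJob(). The same mechanism is visible at the user level when runJob is called directly; a self-contained sketch of my own (the object name RunJobFuncDemo is a placeholder):

import org.apache.spark.{SparkConf, SparkContext, TaskContext}

object RunJobFuncDemo {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("RunJobFuncDemo").setMaster("local"))
    val rdd = sc.parallelize(1 to 10, 2)
    // runJob takes a (TaskContext, Iterator[T]) => U closure; that closure is what each
    // ResultTask deserializes as `func` and runs against its partition's iterator.
    val sums = sc.runJob(rdd, (context: TaskContext, iter: Iterator[Int]) => {
      println("partition " + context.partitionId + " of stage " + context.stageId)
      iter.sum
    })
    println("per-partition sums: " + sums.mkString(", "))
    sc.stop()
  }
}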