基于案例贯通Spark Streaming流计算框架的运行源码

最新推荐文章于 2018-12-12 14:13:02 发布

xiaonaughty

最新推荐文章于 2018-12-12 14:13:02 发布

阅读量365

点赞数

分类专栏： SparkStreaming 文章标签： spark 源码

本文链接：https://blog.csdn.net/xiaonaughty/article/details/51483649

版权

SparkStreaming 专栏收录该内容

6 篇文章 0 订阅

订阅专栏

案例代码：

package com.dt.spark.sparkstreaming
import org.apache.spark.SparkConf
import org.apache.spark.sql.Row
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
import org.apache.spark.streaming.{Seconds, StreamingContext}

/**
  * 使用Spark Streaming+Spark SQL来在线动态计算电商中不同类别中最热门的商品排名，例如手机这个类别下面最热门的三种手机、电视这个类别
  * 下最热门的三种电视，该实例在实际生产环境下具有非常重大的意义；
  *
  * @author DT大数据梦工厂
  * 新浪微博：http://weibo.com/ilovepains/
  *
  *
  *   实现技术：Spark Streaming+Spark SQL，之所以Spark Streaming能够使用ML、sql、graphx等功能是因为有foreachRDD和Transform
  * 等接口，这些接口中其实是基于RDD进行操作，所以以RDD为基石，就可以直接使用Spark其它所有的功能，就像直接调用API一样简单。
  *  假设说这里的数据的格式：user item category，例如Rocky Samsung Android
  */
object OnlineTheTop3ItemForEachCategory2DB {
  def main(args: Array[String]){
    val conf = new SparkConf() //创建SparkConf对象
    conf.setAppName("OnlineTheTop3ItemForEachCategory2DB") //设置应用程序的名称，在程序运行的监控界面可以看到名称
//    conf.setMaster("spark://Master:7077") //此时，程序在Spark集群
    conf.setMaster("local[6]")
    //设置batchDuration时间间隔来控制Job生成的频率并且创建Spark Streaming执行的入口
    val ssc = new StreamingContext(conf, Seconds(5))

    ssc.checkpoint("/root/Documents/SparkApps/checkpoint")


    val userClickLogsDStream = ssc.socketTextStream("Master", 9999)

    val formattedUserClickLogsDStream = userClickLogsDStream.map(clickLog =>
        (clickLog.split(" ")(2) + "_" + clickLog.split(" ")(1), 1))

//    val categoryUserClickLogsDStream = formattedUserClickLogsDStream.reduceByKeyAndWindow((v1:Int, v2: Int) => v1 + v2,
//      (v1:Int, v2: Int) => v1 - v2, Seconds(60), Seconds(20))

    val categoryUserClickLogsDStream = formattedUserClickLogsDStream.reduceByKeyAndWindow(_+_,
      _-_, Seconds(60), Seconds(20))

    categoryUserClickLogsDStream.foreachRDD { rdd => {
      if (rdd.isEmpty()) {
        println("No data inputted!!!")
      } else {
        val categoryItemRow = rdd.map(reducedItem => {
          val category = reducedItem._1.split("_")(0)
          val item = reducedItem._1.split("_")(1)
          val click_count = reducedItem._2
          Row(category, item, click_count)
        })

        val structType = StructType(Array(
          StructField("category", StringType, true),
          StructField("item", StringType, true),
          StructField("click_count", IntegerType, true)
        ))

        val hiveContext = new HiveContext(rdd.context)
        val categoryItemDF = hiveContext.createDataFrame(categoryItemRow, structType)

        categoryItemDF.registerTempTable("categoryItemTable")

        val reseltDataFram = hiveContext.sql("SELECT category,item,click_count FROM (SELECT category,item,click_count,row_number()" +
          " OVER (PARTITION BY category ORDER BY click_count DESC) rank FROM categoryItemTable) subquery " +
          " WHERE rank <= 3")
        reseltDataFram.show()

        val resultRowRDD = reseltDataFram.rdd

        resultRowRDD.foreachPartition { partitionOfRecords => {

          if (partitionOfRecords.isEmpty){
            println("This RDD is not null but partition is null")
          } else {
            // ConnectionPool is a static, lazily initialized pool of connections
            val connection = ConnectionPool.getConnection()
            partitionOfRecords.foreach(record => {
              val sql = "insert into categorytop3(category,item,client_count) values('" + record.getAs("category") + "','" +
                record.getAs("item") + "'," + record.getAs("click_count") + ")"
              val stmt = connection.createStatement();
              stmt.executeUpdate(sql);

            })
            ConnectionPool.returnConnection(connection) // return to the pool for future reuse

          }
        }
        }
      }
    }
    }
    ssc.start()
    ssc.awaitTermination()
  }
}

代码分析：
1. 创建StreamingContext

val ssc = new StreamingContext(conf, Seconds(5))

/**
 * Create a StreamingContext by providing the details necessary for creating a new SparkContext.
 * @param master cluster URL to connect to (e.g. mesos://host:port, spark://host:port, local[4]).
 * @param appName a name for your job, to display on the cluster web UI
 * @param batchDuration the time interval at which streaming data will be divided into batches
 */
def this(
    master: String,
    appName: String,
    batchDuration: Duration,
    sparkHome: String = null,
    jars: Seq[String] = Nil,
    environment: Map[String, String] = Map()) = {
  this(StreamingContext.createNewSparkContext(master, appName, sparkHome, jars, environment),
       null, batchDuration)
}

StreamingContext在构建的时候会通过配置conf在内部构建SparkContext,说明SparkStreaming就是Spark上的一个应用程序

class StreamingContext private[streaming] (
    sc_ : SparkContext,
    cp_ : Checkpoint,
    batchDur_ : Duration
  ) extends Logging {

private[streaming] def createNewSparkContext(conf: SparkConf): SparkContext = {
  new SparkContext(conf)
}

2.创建socket输入流

val userClickLogsDStream = ssc.socketTextStream("Master", 9999)

def socketTextStream(
    hostname: String,
    port: Int,
    storageLevel: StorageLevel = StorageLevel.MEMORY_AND_DISK_SER_2
  ): ReceiverInputDStream[String] = withNamedScope("socket text stream") {
  socketStream[String](hostname, port, SocketReceiver.bytesToLines, storageLevel)
}

创建SocketInputDStream，返回的是 ReceiverInputDStream类型

def socketStream[T: ClassTag](
    hostname: String,
    port: Int,
    converter: (InputStream) => Iterator[T],
    storageLevel: StorageLevel
  ): ReceiverInputDStream[T] = {
  new SocketInputDStream[T](this, hostname, port, converter, storageLevel)
}

基于SocketReceiver接收数据

class SocketInputDStream[T: ClassTag](
    ssc_ : StreamingContext,
    host: String,
    port: Int,
    bytesToObjects: InputStream => Iterator[T],
    storageLevel: StorageLevel
  ) extends ReceiverInputDStream[T](ssc_) {

  def getReceiver(): Receiver[T] = {
    new SocketReceiver(host, port, bytesToObjects, storageLevel)
  }
}

SocketReceiver的onStart方法开启了了一个线程，在线程里面run时，调用receive方法

class SocketReceiver[T: ClassTag](
    host: String,
    port: Int,
    bytesToObjects: InputStream => Iterator[T],
    storageLevel: StorageLevel
  ) extends Receiver[T](storageLevel) with Logging {

  def onStart() {
    // Start the thread that receives data over a connection
    new Thread("Socket Receiver") {
      setDaemon(true)
      override def run() { receive() }
    }.start()
  }

receive根据ip和port连接socket,不断循环接收数据，以hashmap的方式存储

def receive() {
  var socket: Socket = null
  try {
    logInfo("Connecting to " + host + ":" + port)
    socket = new Socket(host, port)
    logInfo("Connected to " + host + ":" + port)
    val iterator = bytesToObjects(socket.getInputStream())
    while(!isStopped && iterator.hasNext) {
      store(iterator.next)
    }

3.继承结构
Ctrl+H查看：
Dstream–>InputDStream–>ReceiverInputDStream–>SocketInputDStream
InputDStream中有start和stop方法，具体子类SocketInputDStream中onstart方法是被回调的，ReceiverSupervisor调用的就是start方法
Dstream中有ForEachDstream，输出流，所有Dstream级别的action输出操作，都会有ForEachDStream的产生，里面会创建新job

class ForEachDStream[T: ClassTag] (
    parent: DStream[T],
    foreachFunc: (RDD[T], Time) => Unit,
    displayInnerRDDOps: Boolean
  ) extends DStream[Unit](parent.ssc) {

  override def generateJob(time: Time): Option[Job] = {
    parent.getOrCompute(time) match {
      case Some(rdd) =>
        val jobFunc = () => createRDDWithLocalProperties(time, displayInnerRDDOps) {
          foreachFunc(rdd, time)
        }
        Some(new Job(time, jobFunc))
      case None => None
    }
  }
}

Dstream.scala中，根据时间间隔生成RDD，hashmap中key是batch duration，value是RDD

private[streaming] var generatedRDDs = new HashMap[Time, RDD[T]] ()

getOrCompute生成RDD,放在hashmap中

private[streaming] final def getOrCompute(time: Time): Option[RDD[T]] = {
...
}

4.ssc.start()
JobScheduler是在一个新的线程中启动

def start(): Unit = synchronized {
  state match {
    case INITIALIZED =>
      startSite.set(DStream.getCreationSite())
      StreamingContext.ACTIVATION_LOCK.synchronized {
        StreamingContext.assertNoOtherContextIsActive()
        try {
          validate()

          // Start the streaming scheduler in a new thread, so that thread local properties
          // like call sites and job groups can be reset without affecting those of the
          // current thread.
          ThreadUtils.runInNewThread("streaming-start") {
            sparkContext.setCallSite(startSite.get)
            sparkContext.clearJobGroup()
            sparkContext.setLocalProperty(SparkContext.SPARK_JOB_INTERRUPT_ON_CANCEL, "false")
            scheduler.start()
          }
          state = StreamingContextState.ACTIVE

JobScheduler.scala

private var eventLoop: EventLoop[JobSchedulerEvent] = null

def start(): Unit = synchronized {
  if (eventLoop != null) return // scheduler has already been started

  logDebug("Starting JobScheduler")
  eventLoop = new EventLoop[JobSchedulerEvent]("JobScheduler") {
    override protected def onReceive(event: JobSchedulerEvent): Unit = processEvent(event)

    override protected def onError(e: Throwable): Unit = reportError("Error in job scheduler", e)
  }
  eventLoop.start()

  // attach rate controllers of input streams to receive batch completion updates
  for {
    inputDStream <- ssc.graph.getInputStreams
    rateController <- inputDStream.rateController
  } ssc.addStreamingListener(rateController)

  listenerBus.start(ssc.sparkContext)
  receiverTracker = new ReceiverTracker(ssc)
  inputInfoTracker = new InputInfoTracker(ssc)
  receiverTracker.start()
  jobGenerator.start()
  logInfo("Started JobScheduler")
}

listenerBus监听所有event
ReceiverTracker处理数据接收，数据缓存，Block生成等，以发送Job的方式到集群中的Executor去启动receiver，内部使用ReceiverTrackEndpoint接收来自Receiver的消息。
JobScheduler是job的调度器，用一个线程循环，监听不同的job：启动JobStarted，完成JobCompleted，失败ErrorReported

private def processEvent(event: JobSchedulerEvent) {
  try {
    event match {
      case JobStarted(job, startTime) => handleJobStart(job, startTime)
      case JobCompleted(job, completedTime) => handleJobCompletion(job, completedTime)
      case ErrorReported(m, e) => handleError(m, e)
    }
  } catch {
    case e: Throwable =>
      reportError("Error in job scheduler", e)
  }
}

xiaonaughty

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
基于案例贯通Spark Streaming流计算框架的运行源码

案例代码：package com.dt.spark.sparkstreamingimport org.apache.spark.SparkConfimport org.apache.spark.sql.Rowimport org.apache.spark.sql.hive.HiveContextimport org.apache.spark.sql.types.{IntegerType, S
复制链接

扫一扫

专栏目录