/*
 * Spark Streaming结合Spark SQL开发案例：电商中不同类别中最热门的商品排名
 * (Spark Streaming + Spark SQL example: ranking the most popular items within each
 * e-commerce category)
 *
 * 作者 (author)：Liu-Lv
 * 分类专栏 (column)：spark
 * 本文链接 (source)：https://blog.csdn.net/u010077555/article/details/78328346
 */
/**
  * Spark Streaming + Spark SQL example.
  *
  * Reads click-log lines from a TCP socket, counts clicks per (category, item) over a
  * sliding 60-second window that advances every 20 seconds, ranks the items inside each
  * category with a SQL window function, and inserts the top three rows per category into
  * the `categorytop3` table through `ConnectionPool`.
  *
  * Each input line is expected to be space separated as `user item category`,
  * for example `Rocky Samsung Android`.
  */
object StreamingCombineSqlTmp {

  /**
    * Parses one raw click-log line into a `((category, item), 1)` pair.
    *
    * The key is a tuple rather than a `category_item` string so that categories or items
    * that themselves contain `_` are not mis-split later on.
    *
    * @param clickLog raw input line, expected as `user item category`
    * @return the keyed pair, or `None` (after printing a diagnostic) when the line has
    *         fewer than three space-separated fields
    */
  private def parseClickLog(clickLog: String): Option[((String, String), Int)] = {
    val fields = clickLog.split(" ")
    if (fields.length >= 3) {
      Some(((fields(2), fields(1)), 1))
    } else {
      println("-------------------------------------------------------------------------clickLogSplitArr.length less 3")
      None
    }
  }

  /**
    * Builds the streaming pipeline, starts it and blocks until it terminates.
    *
    * @param args command-line arguments (not read)
    */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName(this.getClass.getName)

    // Batch interval in seconds: the input stream is cut into one RDD per interval.
    val batchInterval = 5
    val ssc = new StreamingContext(conf, Seconds(batchInterval))

    // Checkpointing is required by the inverse-function form of reduceByKeyAndWindow used below.
    // The directory should live on a fault-tolerant, HDFS-compatible file system.
    // NOTE(review): the '?' in this path looks like a typo for '/' - left unchanged so that an
    // existing checkpoint location is not silently moved; confirm and fix.
    ssc.checkpoint("/root/Documents/SparkApps?checkpoint")

    // Input source: text lines received over a TCP socket.
    // For local testing a source can be simulated with `nc -lk 8888` on the target host.
    val userClickLogsStream = ssc.socketTextStream("hadoop01", 8888)

    // Malformed lines are dropped here instead of being mapped to null, which would make the
    // windowed reduce below fail on a null pair.
    val formattedUserClickLogsDStream =
      userClickLogsStream.flatMap(clickLog => parseClickLog(clickLog).toList)

    // Sliding window: 60 seconds wide, recomputed every 20 seconds. The inverse function lets
    // Spark update the previous window incrementally (add the new batches, subtract the expired
    // ones). With an inverse function, keys whose count has fallen back to 0 stay in the window
    // state unless a filter is supplied, so they are filtered out explicitly.
    val categoryUserClickLogsDStream = formattedUserClickLogsDStream.reduceByKeyAndWindow(
      (left: Int, right: Int) => left + right,
      (total: Int, expired: Int) => total - expired,
      Seconds(60),
      Seconds(20),
      filterFunc = (entry: ((String, String), Int)) => entry._2 > 0
    )

    // Parameterized insert: values are bound, never concatenated into the SQL text.
    // NOTE(review): the column is spelled `client_count` here while the DataFrame column is
    // `click_count`; kept as in the original - verify against the real table definition.
    val insertSql = "insert into categorytop3(category,item,client_count) values(?,?,?)"

    categoryUserClickLogsDStream.foreachRDD { rdd =>
      if (rdd.isEmpty()) {
        println("No data inputted")
      } else {
        // One Row per (category, item, click_count) for the DataFrame.
        val categoryItemRow = rdd.map { case ((category, item), clickCount) =>
          Row(category, item, clickCount)
        }

        // Column names and types of the DataFrame.
        val structType = StructType(Array(
          StructField("category", StringType, nullable = true),
          StructField("item", StringType, nullable = true),
          StructField("click_count", IntegerType, nullable = true)
        ))

        // NOTE(review): a HiveContext is created for every batch, as in the original;
        // consider reusing a single instance across batches.
        val hiveContext = new HiveContext(rdd.context)
        val categoryItemDF = hiveContext.createDataFrame(categoryItemRow, structType)

        // Expose the batch as a temporary table so it can be queried with SQL.
        categoryItemDF.registerTempTable("categoryItemTable")

        // Rank items inside each category by click count and keep the top three.
        val resultDataFrame = hiveContext.sql("SELECT category,item,click_count FROM (SELECT category,item,click_count,row_number()" +
          "OVER(PARTITION BY category ORDER BY click_count DESC) rank FROM categoryItemTable) subquery " +
          "WHERE rank <= 3")

        // Print the first rows of the result in tabular form.
        resultDataFrame.show()

        // Write each partition with a single pooled connection and a single prepared statement.
        resultDataFrame.rdd.foreachPartition { partitionOfRecords =>
          if (partitionOfRecords.isEmpty) {
            println("this is RDD is not null but partition is null")
          } else {
            Option(ConnectionPool.getConnection) match {
              case None =>
                // Previously the records of this partition were dropped without any trace.
                println("ConnectionPool returned no connection; skipping partition")
              case Some(connection) =>
                try {
                  val stmt = connection.prepareStatement(insertSql)
                  try {
                    partitionOfRecords.foreach { record =>
                      stmt.setString(1, record.getAs[String]("category"))
                      stmt.setString(2, record.getAs[String]("item"))
                      stmt.setInt(3, record.getAs[Int]("click_count"))
                      stmt.executeUpdate()
                    }
                  } finally {
                    stmt.close()
                  }
                } finally {
                  // Always hand the connection back, even when an insert fails.
                  ConnectionPool.returnConnection(connection)
                }
            }
          }
        }
      }
    }

    ssc.start()

    ssc.awaitTermination()
  }
}