Flink: Real-Time Hot Items Statistics

Input data: UserBehavior.csv

Sample record (fields: userId, itemId, categoryId, behavior, timestamp):

662867,2244074,1575622,pv,1511658000

To implement "real-time hot items", we can first restate it as a requirement that is easier for a programmer to act on: every 5 minutes, output the top N items with the most clicks over the last hour. Breaking this down, we need to do roughly the following (a quick check of the sliding-window arithmetic follows this list):
1. Extract the business timestamp and tell Flink to build windows on event time
env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)
2. Filter out the click (pv) events
.filter(_.behavior == "pv") // keep only click events
3. Aggregate over a sliding window (Sliding Window) of one hour, evaluated every 5 minutes
.keyBy(_.itemId)
.timeWindow(Time.hours(1), Time.minutes(5))
.aggregate(new CountAgg(), new ItemViewWindowResult()) // (pre-aggregation function, window result function)
4. For each window, output the top N items by click count
.keyBy(_.windowEnd) // group by window end, collecting the per-item counts of each window
.process(new TopNHotItems(10))  // custom TopN processing
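Before the full program, it is worth spelling out what the sliding window means concretely: with a one-hour size and a five-minute slide, every event is counted in size/slide = 12 overlapping windows, and a window's result is emitted once the event-time watermark passes its end. The snippet below is only a back-of-the-envelope check of that arithmetic (it mirrors a zero-offset sliding assignment, not Flink's internal assigner), using the sample record's timestamp:

object SlidingWindowCheck extends App {
  val sizeMs  = 60 * 60 * 1000L     // window size: 1 hour
  val slideMs = 5 * 60 * 1000L      // slide: 5 minutes
  val t       = 1511658000L * 1000  // event time of the sample record, in ms

  // Window starts are multiples of the slide; a window [start, start + size) covers t
  // iff start <= t and start > t - size.
  val lastStart      = t - t % slideMs
  val coveringStarts = Iterator.iterate(lastStart)(_ - slideMs).takeWhile(_ > t - sizeMs).toList
  println(coveringStarts.size) // 12 = sizeMs / slideMs
}

The complete DataStream implementation: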
import java.sql.Timestamp

import org.apache.flink.api.common.functions.AggregateFunction
import org.apache.flink.api.common.state.{ListState, ListStateDescriptor}
import org.apache.flink.api.java.tuple.{Tuple, Tuple1}
import org.apache.flink.configuration.Configuration
import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.streaming.api.functions.KeyedProcessFunction
import org.apache.flink.streaming.api.scala.{DataStream, StreamExecutionEnvironment}
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.api.scala.function.WindowFunction
import org.apache.flink.streaming.api.windowing.time.Time
import org.apache.flink.streaming.api.windowing.windows.TimeWindow
import org.apache.flink.util.Collector

import scala.collection.mutable.ListBuffer

// Input record case class
case class UserBehavior(userId: Long,
                        itemId: Long,
                        categoryId: Int,
                        behavior: String,
                        timestamp: Long )

// Case class for the windowed aggregation result
case class ItemViewCount(itemId: Long,
                         windowEnd: Long,
                         count: Long )


object HotItems {
  def main(args: Array[String]): Unit = {
    val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)

    // Read the data from the file and map it to the case class
    val inputStream: DataStream[String] = env.readTextFile("D:\\Mywork\\workspace\\Project_idea\\UserBehaviorAnalysis0903\\HotItemsAnalysis\\src\\main\\resources\\UserBehavior.csv")

    val dataStream = inputStream.map(data => {
      val arr: Array[String] = data.split(",")
      UserBehavior(arr(0).toLong, arr(1).toLong, arr(2).toInt, arr(3), arr(4).toLong)
    }).assignAscendingTimestamps(_.timestamp * 1000L)

    // Windowed aggregation result
    val aggStream: DataStream[ItemViewCount] = dataStream
      .filter(_.behavior == "pv") // keep only click (pv) events
      .keyBy(_.itemId)
      .timeWindow(Time.hours(1), Time.minutes(5))
      .aggregate(new CountAgg(), new ItemViewWindowResult())

    val resultStream = aggStream
      .keyBy(_.windowEnd) // group by window end, collecting the per-item counts of each window
      .process(new TopNHotItems(10))  // custom TopN processing

    dataStream.print("data")
    resultStream.print()

    env.execute("hot items")
  }
}

//==> Custom pre-aggregation function [IN, ACC, OUT] (incremental window aggregation)
class CountAgg() extends AggregateFunction[UserBehavior, Long, Long]{
  override def createAccumulator(): Long = 0L

  override def add(value: UserBehavior, accumulator: Long): Long = accumulator + 1

  override def getResult(accumulator: Long): Long = accumulator

  override def merge(a: Long, b: Long): Long = a + b // only needed for merging windows, not used here
}

//==> Custom window function [IN, OUT, KEY, W]: attaches itemId and window end to the pre-aggregated count
class ItemViewWindowResult() extends WindowFunction[Long, ItemViewCount, Long, TimeWindow] {
  override def apply(key: Long, window: TimeWindow, input: Iterable[Long], out: Collector[ItemViewCount]): Unit = {
    val itemId = key
    val windowEnd = window.getEnd
    val count = input.iterator.next()
    out.collect(ItemViewCount(itemId, windowEnd, count))
  }
}
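
A note on why the two functions are combined: CountAgg is applied incrementally, so Flink only keeps one Long per item and window instead of buffering every UserBehavior record, and when the window fires that single Long is handed to ItemViewWindowResult, which merely attaches the itemId and the window end (hence the input.iterator.next() above). A quick standalone check of CountAgg outside Flink (illustration only, not part of the job; CountAggCheck is just a scratch object):

object CountAggCheck extends App {
  val agg = new CountAgg()
  // Two clicks on the same item should yield an accumulator value of 2.
  val sample = UserBehavior(662867L, 2244074L, 1575622, "pv", 1511658000L)
  val acc = agg.add(sample, agg.add(sample, agg.createAccumulator()))
  println(agg.getResult(acc)) // 2
}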

//==> Custom keyed process function [KEY, IN, OUT]: collects all counts of a window and emits the top N
class TopNHotItems(topSize: Int) extends KeyedProcessFunction[Long, ItemViewCount, String] {
  // State: a ListState holding the ItemViewCounts of the current window
  private var itemViewCountListState: ListState[ItemViewCount] = _

  override def open(parameters: Configuration): Unit = {
    itemViewCountListState = getRuntimeContext
      .getListState(new ListStateDescriptor[ItemViewCount]("itemViewCount-list", classOf[ItemViewCount]))
  }

  // Invoked for every incoming ItemViewCount
  override def processElement(i: ItemViewCount, context: KeyedProcessFunction[Long, ItemViewCount, String]#Context, collector: Collector[String]): Unit = {
    // Buffer the record in the ListState
    itemViewCountListState.add(i)

    // Register an event-time timer at windowEnd + 1 ms; it fires once the watermark passes windowEnd,
    // i.e. when all aggregation results for this window have arrived
    context.timerService().registerEventTimeTimer(i.windowEnd + 1)
  }

  // When the timer fires, all results for this window have arrived and can be sorted and emitted
  override def onTimer(timestamp: Long, ctx: KeyedProcessFunction[Long, ItemViewCount, String]#OnTimerContext, out: Collector[String]): Unit = {
    // Copy everything from the ListState into a ListBuffer so it can be sorted
    val allItemViewCounts: ListBuffer[ItemViewCount] = ListBuffer()
    val iter = itemViewCountListState.get().iterator()
    while (iter.hasNext){
      allItemViewCounts += iter.next()
    }

    // Clear the state to free memory
    itemViewCountListState.clear()

    // Sort by count in descending order and take the top topSize entries
    val sortedItemViewCounts = allItemViewCounts
      .sortBy(_.count)(Ordering.Long.reverse) // pass the descending Ordering as the curried implicit parameter
      .take(topSize)

    // Format the ranking as a String for readable console output
    val result: StringBuilder = new StringBuilder
    result.append("窗口结束时间: ").append(new Timestamp(timestamp - 1)).append("\n")

    // Print one line per ItemViewCount in the sorted result
    for (elem <- sortedItemViewCounts.indices) {
      val currentItemViewCount = sortedItemViewCounts(elem)
      result.append("NO.").append(elem + 1)
        .append(" 商品Id = ").append(currentItemViewCount.itemId)
        .append(" 热门度 = ").append(currentItemViewCount.count).append("\n")
    }

    result.append("=========================================\n\n")
    Thread.sleep(1000) // slow down the output so each window's ranking stays readable
    out.collect(result.toString())
  }

}
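
One caveat about the timestamp assignment: assignAscendingTimestamps(_.timestamp * 1000L) is only safe because UserBehavior.csv is sorted by timestamp. If the source were out of order (for example, reading from Kafka), a bounded-out-of-orderness watermark assigner would be used instead. A minimal sketch, assuming the Flink 1.10-era BoundedOutOfOrdernessTimestampExtractor and an (assumed) tolerated lateness of 5 seconds:

import org.apache.flink.streaming.api.functions.timestamps.BoundedOutOfOrdernessTimestampExtractor
import org.apache.flink.streaming.api.windowing.time.Time

// Emits watermarks that lag the largest timestamp seen so far by 5 seconds,
// so events up to 5 seconds late are still assigned to their windows.
class UserBehaviorTsExtractor
  extends BoundedOutOfOrdernessTimestampExtractor[UserBehavior](Time.seconds(5)) {
  override def extractTimestamp(element: UserBehavior): Long = element.timestamp * 1000L
}

// Usage: replace .assignAscendingTimestamps(_.timestamp * 1000L) with
//   .assignTimestampsAndWatermarks(new UserBehaviorTsExtractor)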

Hot items with the Table API and SQL

import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.streaming.api.scala._
import org.apache.flink.table.api.{EnvironmentSettings, Slide, Table}
import org.apache.flink.table.api.scala._
import org.apache.flink.types.Row

object HotItemsWithSql {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)

    // Read the data from the file and map it to the case class
    val inputStream: DataStream[String] = env.readTextFile("D:\\Mywork\\workspace\\Project_idea\\flink-2021\\src\\main\\resources\\UserBehavior.csv")

    val dataStream = inputStream.map { data =>
      val arr = data.split(",")
      UserBehavior(arr(0).toLong, arr(1).toLong, arr(2).toInt, arr(3), arr(4).toLong)
    }.assignAscendingTimestamps(_.timestamp * 1000L)

    // Create the table execution environment (Blink planner, streaming mode)
    val settings = EnvironmentSettings.newInstance()
      .useBlinkPlanner()
      .inStreamingMode()
      .build()
    val tableEnv = StreamTableEnvironment.create(env, settings)

    // Create a Table from the DataStream
    val dataTable: Table = tableEnv.fromDataStream(dataStream, 'itemId, 'behavior, 'timestamp.rowtime as 'ts)

    //  // 1. Windowed aggregation with the Table API (pure Table API TopN needs a user-defined stateful table aggregate function, cf. TableAggregateFunctionTest; a sketch follows after this program)
    //    val aggTable: Table = dataTable
    //      .filter('behavior === "pv")
    //      .window(Slide over 1.hours every 5.minutes on 'ts as 'sw) //定义滑动窗口sw
    //      .groupBy('itemId, 'sw)
    //      .select('itemId, 'sw.end as 'windowEnd, 'itemId.count as 'cnt)
    //
    //
    //  // 2. TopN selection with SQL
    //    tableEnv.createTemporaryView("aggtable", aggTable, 'itemId, 'windowEnd, 'cnt)
    //    val resultTable = tableEnv.sqlQuery(
    //      """
    //        |select *
    //        |from (
    //        |  select
    //        |    *,
    //        |    row_number()
    //        |      over (partition by windowEnd order by cnt desc)
    //        |      as row_num
    //        |    from aggtable )
    //        |where row_num <= 5
    //      """.stripMargin)

    // Pure SQL implementation
    tableEnv.createTemporaryView("datatable", dataStream, 'itemId, 'behavior, 'timestamp.rowtime as 'ts)
    val resultSqlTable = tableEnv.sqlQuery(
      """
        |select *
        |from (
        |  select
        |    *,
        |    row_number()
        |      over (partition by windowEnd order by cnt desc)
        |      as row_num
        |    from (
        |      select
        |        itemId,
        |        hop_end(ts, interval '5' minute, interval '1' hour) as windowEnd,
        |        count(itemId) as cnt
        |      from datatable
        |      where behavior = 'pv'
        |      group by
        |        itemId,
        |        hop(ts, interval '5' minute, interval '1' hour)
        |    )
        |)
        |where row_num <= 5
      """.stripMargin)

    //resultTable.toRetractStream[Row].print()
    resultSqlTable.toRetractStream[Row].print()
    env.execute("hot items sql job")


  }
}
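
On the Table API variant that is commented out above: the pure Table API has no built-in TopN, which is why a user-defined, stateful TableAggregateFunction is needed (the TableAggregateFunctionTest mentioned in the comment is not reproduced in this post). As a rough illustration of what such a function looks like, here is a minimal top-2 sketch in the style of the Flink documentation's Top2 example; Top2, Top2Acc and the usage lines are illustrative, not taken from the original project:

import org.apache.flink.api.java.tuple.{Tuple2 => JTuple2}
import org.apache.flink.table.functions.TableAggregateFunction
import org.apache.flink.util.Collector

// Accumulator keeping the two largest counts seen so far.
class Top2Acc {
  var first: Long = Long.MinValue
  var second: Long = Long.MinValue
}

// Emits (count, rank) pairs for the two largest counts in a group.
class Top2 extends TableAggregateFunction[JTuple2[java.lang.Long, java.lang.Integer], Top2Acc] {
  override def createAccumulator(): Top2Acc = new Top2Acc

  def accumulate(acc: Top2Acc, value: Long): Unit = {
    if (value > acc.first) {
      acc.second = acc.first
      acc.first = value
    } else if (value > acc.second) {
      acc.second = value
    }
  }

  def emitValue(acc: Top2Acc, out: Collector[JTuple2[java.lang.Long, java.lang.Integer]]): Unit = {
    if (acc.first != Long.MinValue) out.collect(JTuple2.of(Long.box(acc.first), Int.box(1)))
    if (acc.second != Long.MinValue) out.collect(JTuple2.of(Long.box(acc.second), Int.box(2)))
  }
}

// Usage sketch (against the commented-out aggTable):
//   val top2 = new Top2
//   aggTable.groupBy('windowEnd).flatAggregate(top2('cnt) as ('topCnt, 'rank)).select('windowEnd, 'topCnt, 'rank)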
