输入数据:UserBehavior.csv
数据示例:
662867 | 2244074 | 1575622 | pv | 1511658000 |
实现一个“实时热门商品”的需求,可以将“实时热门商品”翻译成程序员更好理解的需求:
每隔 5 分钟输出最近一小时内点击量最多的前 N 个商品。
将这个需求进行分解,我们大概要做这么几件事情:
1. 抽取出业务时间戳,告诉 Flink 框架基于业务时间做窗口
env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)
2. 过滤出点击行为数据
.filter(_.behavior == "pv") //只要点击行为
3. 按一小时的窗口大小,每 5 分钟统计一次,做滑动窗口聚合(Sliding Window)
.keyBy(_.itemId)
.timeWindow(Time.hours(1), Time.minutes(5))
.aggregate(new CountAgg(), new ItemViewWindowResult()) //(预聚合规则,窗口内操作规则)
4. 按每个窗口聚合,输出每个窗口中点击量前 N 名的商品
.keyBy(_.windowEnd) //按照窗口分组,收集当前窗口内的商品count数据
.process(new TopNHotItems(10)) //自定义处理流程
import java.sql.Timestamp
import org.apache.flink.api.common.functions.AggregateFunction
import org.apache.flink.api.common.state.{ListState, ListStateDescriptor}
import org.apache.flink.api.java.tuple.{Tuple, Tuple1}
import org.apache.flink.configuration.Configuration
import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.streaming.api.functions.KeyedProcessFunction
import org.apache.flink.streaming.api.scala.{DataStream, StreamExecutionEnvironment}
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.api.scala.function.WindowFunction
import org.apache.flink.streaming.api.windowing.time.Time
import org.apache.flink.streaming.api.windowing.windows.TimeWindow
import org.apache.flink.util.Collector
import scala.collection.mutable.ListBuffer
/**
 * Input record: one line of UserBehavior.csv parsed into typed fields.
 *
 * The jobs in this file build it from the comma-separated columns of a line, in order.
 *
 * @param userId     first CSV column
 * @param itemId     second CSV column; the key used for the per-item window counts
 * @param categoryId third CSV column
 * @param behavior   fourth CSV column; only records whose value is "pv" are counted
 * @param timestamp  fifth CSV column; it is multiplied by 1000 when used as the event-time
 *                   timestamp, so it presumably holds epoch seconds (confirm against the data)
 */
case class UserBehavior(userId: Long,
itemId: Long,
categoryId: Int,
behavior: String,
timestamp: Long )
/**
 * Result of the window aggregation: how often one item was counted in one window.
 *
 * @param itemId    the item the count belongs to (the key of the keyed window)
 * @param windowEnd end timestamp of the window, as returned by `TimeWindow.getEnd`
 * @param count     number of records counted for the item in that window
 */
case class ItemViewCount(itemId: Long,
windowEnd: Long,
count: Long )
/**
 * Streaming job that reports, every 5 minutes, the items with the most "pv" records during the
 * preceding hour. Windows are evaluated in event time taken from the records themselves.
 */
object HotItems {
  // Input file used when no path is supplied on the command line.
  private val DefaultInputPath: String =
    "D:\\Mywork\\workspace\\Project_idea\\UserBehaviorAnalysis0903\\HotItemsAnalysis\\src\\main\\resources\\UserBehavior.csv"

  /**
   * Builds and runs the pipeline.
   *
   * @param args optional; `args(0)` is the path of the CSV input file. When it is absent the
   *             built-in default path is used, so running without arguments behaves as before.
   */
  def main(args: Array[String]): Unit = {
    val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)

    // Read the file line by line and parse each comma-separated line into a UserBehavior.
    val inputPath: String = args.headOption.getOrElse(DefaultInputPath)
    val inputStream: DataStream[String] = env.readTextFile(inputPath)
    val dataStream = inputStream
      .map(data => {
        val arr: Array[String] = data.split(",")
        UserBehavior(arr(0).toLong, arr(1).toLong, arr(2).toInt, arr(3), arr(4).toLong)
      })
      // The timestamp column is scaled by 1000 to get milliseconds (presumably epoch seconds).
      .assignAscendingTimestamps(_.timestamp * 1000L)

    // Per item: count the "pv" records in 1-hour windows that slide every 5 minutes.
    val aggStream: DataStream[ItemViewCount] = dataStream
      .filter(_.behavior == "pv") // keep click ("pv") records only
      .keyBy(_.itemId)
      .timeWindow(Time.hours(1), Time.minutes(5))
      .aggregate(new CountAgg(), new ItemViewWindowResult())

    // Per window: gather the item counts that share a window end and rank them.
    val resultStream = aggStream
      .keyBy(_.windowEnd)
      .process(new TopNHotItems(10))

    dataStream.print("data")
    resultStream.print()
    env.execute("hot items")
  }
}
/**
 * Incremental pre-aggregation for the window: counts the records it is given.
 *
 * The type parameters of AggregateFunction are [input, accumulator, output]; here the
 * accumulator and the output are both the running count.
 */
class CountAgg() extends AggregateFunction[UserBehavior, Long, Long] {

  /** A fresh window starts counting from zero. */
  override def createAccumulator(): Long = 0L

  /** Every record adds one to the count; the record's content is not inspected. */
  override def add(value: UserBehavior, accumulator: Long): Long = {
    accumulator + 1
  }

  /** Combines two partial counts. The original author marked this as unused in this job. */
  override def merge(a: Long, b: Long): Long = {
    a + b
  }

  /** The accumulator already is the final count. */
  override def getResult(accumulator: Long): Long = accumulator
}
/**
 * Window function that wraps a pre-aggregated count into an [[ItemViewCount]].
 *
 * The type parameters of WindowFunction are [input, output, key, window]: the input is the
 * Long count, the key is the item id, and the window supplies the end timestamp.
 */
class ItemViewWindowResult() extends WindowFunction[Long, ItemViewCount, Long, TimeWindow] {

  /**
   * Emits exactly one ItemViewCount for the given key and window.
   *
   * @param key    item id the window is keyed by
   * @param window the window whose end timestamp is recorded in the result
   * @param input  the aggregated values; only the first element is read
   * @param out    collector receiving the result
   */
  override def apply(key: Long,
                     window: TimeWindow,
                     input: Iterable[Long],
                     out: Collector[ItemViewCount]): Unit = {
    out.collect(
      ItemViewCount(itemId = key, windowEnd = window.getEnd, count = input.iterator.next())
    )
  }
}
/**
 * Buffers the item counts that arrive for one key and, when the timer for that key fires, emits
 * a formatted ranking of the `topSize` items with the highest counts.
 *
 * The type parameters of KeyedProcessFunction are [key, input, output]. In HotItems the stream is
 * keyed by `windowEnd`, so each key corresponds to one window.
 *
 * @param topSize maximum number of items listed in each emitted ranking
 */
class TopNHotItems(topSize: Int) extends KeyedProcessFunction[Long, ItemViewCount, String] {

  // Keyed list state holding every ItemViewCount received so far for the current key.
  private var itemViewCountListState: ListState[ItemViewCount] = _

  /** Obtains the list state handle from the runtime context. */
  override def open(parameters: Configuration): Unit = {
    itemViewCountListState = getRuntimeContext.getListState(
      new ListStateDescriptor[ItemViewCount]("itemViewCount-list", classOf[ItemViewCount]))
  }

  /**
   * Stores the element and requests an event-time timer 1 ms after its window end.
   *
   * @param i         the per-item count of one window
   * @param context   gives access to the timer service
   * @param collector unused here; output is produced in `onTimer`
   */
  override def processElement(i: ItemViewCount,
                              context: KeyedProcessFunction[Long, ItemViewCount, String]#Context,
                              collector: Collector[String]): Unit = {
    itemViewCountListState.add(i)
    // Every element of the same window asks for the same timestamp; Flink keeps a single timer
    // per key and timestamp, so the repeated registration is harmless.
    context.timerService().registerEventTimeTimer(i.windowEnd + 1)
  }

  /**
   * Ranks the buffered counts and emits the report for the window that ended at `timestamp - 1`.
   *
   * @param timestamp the timer's timestamp, i.e. window end + 1 ms
   * @param ctx       timer context (not used)
   * @param out       collector receiving the formatted ranking
   */
  override def onTimer(timestamp: Long,
                       ctx: KeyedProcessFunction[Long, ItemViewCount, String]#OnTimerContext,
                       out: Collector[String]): Unit = {
    import scala.collection.JavaConverters._

    // Copy the state into a Scala collection before clearing it, then release the state.
    val bufferedCounts: List[ItemViewCount] = itemViewCountListState.get().asScala.toList
    itemViewCountListState.clear()

    // Highest count first, limited to topSize entries.
    val topItems = bufferedCounts
      .sortBy(_.count)(Ordering.Long.reverse)
      .take(topSize)

    // Render the ranking as text, one line per item.
    val result: StringBuilder = new StringBuilder
    result.append("窗口结束时间: ").append(new Timestamp(timestamp - 1)).append("\n")
    topItems.zipWithIndex.foreach { case (itemViewCount, index) =>
      result.append("NO.").append(index + 1)
        .append(" 商品Id = ").append(itemViewCount.itemId)
        .append(" 热门度 = ").append(itemViewCount.count).append("\n")
    }
    result.append("=========================================\n\n")

    // Emit right away: a timer callback runs on the operator's task thread and must not block it.
    out.collect(result.toString())
  }
}
Table API 和 SQL 实现热门商品
import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.streaming.api.scala._
import org.apache.flink.table.api.{EnvironmentSettings, Slide, Table}
import org.apache.flink.table.api.scala._
import org.apache.flink.types.Row
/**
 * Table/SQL variant of the hot-items job: a single SQL query counts "pv" records per item in
 * 1-hour windows that advance every 5 minutes and keeps the 5 highest counts of each window.
 */
object HotItemsWithSql {
  // Input file used when no path is supplied on the command line.
  private val DefaultInputPath: String =
    "D:\\Mywork\\workspace\\Project_idea\\flink-2021\\src\\main\\resources\\UserBehavior.csv"

  /**
   * Builds and runs the pipeline.
   *
   * @param args optional; `args(0)` is the path of the CSV input file. When it is absent the
   *             built-in default path is used, so running without arguments behaves as before.
   */
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)

    // Read the file line by line and parse each comma-separated line into a UserBehavior.
    val inputPath: String = args.headOption.getOrElse(DefaultInputPath)
    val inputStream: DataStream[String] = env.readTextFile(inputPath)
    val dataStream = inputStream.map { data =>
      val arr = data.split(",")
      UserBehavior(arr(0).toLong, arr(1).toLong, arr(2).toInt, arr(3), arr(4).toLong)
    }.assignAscendingTimestamps(_.timestamp * 1000L)

    // Table environment on the Blink planner in streaming mode.
    val settings = EnvironmentSettings.newInstance()
      .useBlinkPlanner()
      .inStreamingMode()
      .build()
    val tableEnv = StreamTableEnvironment.create(env, settings)

    // Disabled alternative: window aggregation with the Table API followed by a SQL Top-N.
    // (Original author's note: Top-N with the Table API alone needs a custom stateful function,
    // see TableAggregateFunctionTest.)
    //
    // val dataTable: Table = tableEnv.fromDataStream(dataStream, 'itemId, 'behavior, 'timestamp.rowtime as 'ts)
    //
    // // 1. Windowed count with the Table API
    // val aggTable: Table = dataTable
    //   .filter('behavior === "pv")
    //   .window(Slide over 1.hours every 5.minutes on 'ts as 'sw) // sliding window named sw
    //   .groupBy('itemId, 'sw)
    //   .select('itemId, 'sw.end as 'windowEnd, 'itemId.count as 'cnt)
    //
    // // 2. Top-N selection with SQL
    // tableEnv.createTemporaryView("aggtable", aggTable, 'itemId, 'windowEnd, 'cnt)
    // val resultTable = tableEnv.sqlQuery(
    //   """
    //     |select *
    //     |from (
    //     | select
    //     | *,
    //     | row_number()
    //     | over (partition by windowEnd order by cnt desc)
    //     | as row_num
    //     | from aggtable )
    //     |where row_num <= 5
    //   """.stripMargin)

    // Pure SQL implementation: register the stream as a view, with `timestamp` exposed as the
    // rowtime attribute `ts`.
    tableEnv.createTemporaryView("datatable", dataStream, 'itemId, 'behavior, 'timestamp.rowtime as 'ts)

    // Inner query: per-item count in each hopping window (5-minute slide, 1-hour size).
    // Middle query: rank the items of each window by count, descending.
    // Outer query: keep the first 5 ranks.
    val resultSqlTable = tableEnv.sqlQuery(
      """
        |select *
        |from (
        | select
        | *,
        | row_number()
        | over (partition by windowEnd order by cnt desc)
        | as row_num
        | from (
        | select
        | itemId,
        | hop_end(ts, interval '5' minute, interval '1' hour) as windowEnd,
        | count(itemId) as cnt
        | from datatable
        | where behavior = 'pv'
        | group by
        | itemId,
        | hop(ts, interval '5' minute, interval '1' hour)
        | )
        |)
        |where row_num <= 5
        |""".stripMargin)

    //resultTable.toRetractStream[Row].print()
    // The ranking is printed as a retract stream.
    resultSqlTable.toRetractStream[Row].print()
    env.execute("hot items sql job")
  }
}