代码
package com.zjc.flow_analysis.hotitems_analysis
import org.apache.flink.api.common.serialization.SimpleStringSchema
import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.streaming.api.scala._
import org.apache.flink.table.api.{EnvironmentSettings, Slide}
import org.apache.flink.table.api.scala._
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer
import java.sql.Timestamp
import java.util.Properties
/**
 * Every 5 minutes, emit the top-5 most-clicked items within the last hour,
 * implemented with the Flink Table API + SQL (Blink planner).
 *
 * Input: Kafka topic "hotItems", CSV lines of the form
 *   userId,itemId,categoryId,behavior,timestamp
 * where timestamp is in epoch seconds.
 */
object HotItemsTableAPI {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)
    // Windows are driven by event time extracted from each record.
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)

    // Kafka consumer configuration.
    val properties = new Properties()
    properties.setProperty("bootstrap.servers", "hadoop103:9092")
    properties.setProperty("group.id", "consumer-group")
    properties.setProperty("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer")
    properties.setProperty("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer")
    properties.setProperty("auto.offset.reset", "latest")

    val inputStream = env.addSource(new FlinkKafkaConsumer[String]("hotItems", new SimpleStringSchema(), properties))

    // Parse each CSV line into a UserBehavior. Timestamps are assumed to be
    // ascending, so ascending-timestamp watermarks are sufficient.
    val dataStream = inputStream.map(data => {
      val arrayData = data.split(",")
      // arrayData(3) is already a String — no .toString conversion needed.
      UserBehavior(arrayData(0).toLong, arrayData(1).toLong, arrayData(2).toLong, arrayData(3), arrayData(4).toLong)
    }).assignAscendingTimestamps(_.timestamp * 1000L) // seconds -> milliseconds

    val settings = EnvironmentSettings.newInstance().useBlinkPlanner().inStreamingMode().build()
    val tableEnv = StreamTableEnvironment.create(env, settings)

    // Convert the DataStream to a Table, exposing the event time as rowtime 'ts.
    val dataTable = tableEnv.fromDataStream(dataStream, 'itemId, 'behavior, 'timestamp.rowtime as 'ts)

    // Keep only "pv" (page-view) events, then count clicks per item in a
    // sliding window of 1 hour that advances every 5 minutes.
    val aggTable = dataTable.filter('behavior === "pv")
      .window(Slide over 1.hours every 5.minutes on 'ts as 'sw)
      .groupBy('itemId, 'sw)
      .select('itemId, 'itemId.count as 'cnt, 'sw.end as 'windowEnd)
    tableEnv.createTemporaryView("agg", aggTable, 'itemId, 'cnt, 'windowEnd)

    // Rank items per window by click count and keep the top 5.
    val resultTable = tableEnv.sqlQuery(
      """
        |select *
        |from (
        | select *,
        | row_number() over(partition by windowEnd order by cnt desc) as row_num
        | from agg
        |)
        |where row_num <= 5
        |""".stripMargin
    )

    // Rankings change as new events arrive, so a retract stream is required:
    // false = retract a previously emitted row, true = add a new row.
    resultTable.toRetractStream[(Long, Long, Timestamp, Long)].print("result")
    env.execute("商品热门统计(table api & sql实现)")
  }
}
启动zookeeper和kafka集群。
启动HotItemsTableAPI.scala。
效果如下:
含义解析:以第四条数据 5118629 为例,因为新增了一条点击数据,它的点击量变成 2,名次也恰好升到第 2;于是之前名次为 2 的商品变成第 3(先撤回【false】旧结果,再新增【true】新结果),之前名次为 3 的变成第 4,名次为 4 的变成第 5,而之前名次为 5 的则被撤回。也就是说,false 表示撤回(retract)旧结果,true 表示新增(add)新结果。