一、热门实时商品统计
1.DataStream API实现
import java.sql.Timestamp
import org.apache.flink.api.common.functions.AggregateFunction
import org.apache.flink.api.common.state.ListStateDescriptor
import org.apache.flink.api.scala.typeutils.Types
import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.streaming.api.functions.KeyedProcessFunction
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.api.scala.function.ProcessWindowFunction
import org.apache.flink.streaming.api.windowing.time.Time
import org.apache.flink.streaming.api.windowing.windows.TimeWindow
import org.apache.flink.util.Collector
import scala.collection.mutable.ListBuffer
/**
 * Real-time "hot items" ranking.
 *
 * Counts page views ("pv" records) per item over a sliding event-time window
 * (1 hour long, sliding every 5 minutes) and prints the top 3 items of every
 * window.
 *
 * Performance note: with a long window and a short slide every record is
 * assigned to many windows, so the work per record grows with
 * `window length / slide`. Tumbling windows do not have this problem.
 */
object TopHotItems {

  /**
   * One parsed line of UserBehavior.csv.
   *
   * @param timestamp fifth CSV column multiplied by 1000; used as the
   *                  event-time timestamp of the record
   */
  case class UserBehavior(userId: Long,
                          itemId: Long,
                          categoryId: Long,
                          behavior: String,
                          timestamp: Long)

  /** Number of page views (`count`) of `itemId` inside the window `[windowStart, windowEnd)`. */
  case class ItemViewCount(itemId: Long,
                           windowStart: Long,
                           windowEnd: Long,
                           count: Long)

  /** Builds and runs the job; `args` is not used. */
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)

    // The pipeline is never reassigned, so it is a `val`.
    val stream = env
      .readTextFile("D:\\software\\code\\FlinkCode\\src\\main\\resources\\UserBehavior.csv")
      .map(line => {
        val arr = line.split(",")
        UserBehavior(arr(0).toLong, arr(1).toLong, arr(2).toLong, arr(3), arr(4).toLong * 1000L)
      })
      .filter(_.behavior.equals("pv"))
      // Timestamps/watermarks are assigned before the first keyBy so that all
      // downstream keyed operators are driven by the same watermarks.
      .assignAscendingTimestamps(_.timestamp)
      .keyBy(_.itemId)
      .timeWindow(Time.hours(1), Time.minutes(5))
      // Incremental aggregation (CountAgg) combined with a window function
      // (WindowResult) that attaches the window bounds to the count.
      .aggregate(new CountAgg, new WindowResult)
      // Regroup by window end so that all counts of one window meet in TopN.
      .keyBy(_.windowEnd)
      .process(new TopN(3))

    stream.print()
    env.execute()
  }

  /** Incremental aggregator: the accumulator is the number of records seen so far. */
  class CountAgg extends AggregateFunction[UserBehavior, Long, Long] {
    override def createAccumulator(): Long = 0L

    override def add(value: UserBehavior, accumulator: Long): Long = accumulator + 1

    override def getResult(accumulator: Long): Long = accumulator

    // Adds two partial counts. Per the original author's note, merging is only
    // exercised by session windows, not by the sliding window used here.
    override def merge(a: Long, b: Long): Long = a + b
  }

  /**
   * Wraps the pre-aggregated count of one (item, window) pair into an
   * [[ItemViewCount]].
   *
   * Input is the `Long` produced by [[CountAgg]]; the key is the `Long`
   * itemId used in `keyBy`.
   */
  class WindowResult extends ProcessWindowFunction[Long, ItemViewCount, Long, TimeWindow] {
    override def process(key: Long,
                         context: Context,
                         elements: Iterable[Long],
                         out: Collector[ItemViewCount]): Unit = {
      // `elements` carries the aggregate result produced by CountAgg.
      out.collect(ItemViewCount(key, context.window.getStart, context.window.getEnd, elements.head))
    }
  }

  /**
   * Collects every [[ItemViewCount]] of one window (the stream is keyed by
   * `windowEnd`) in list state and, once the watermark passes
   * `windowEnd + 100`, emits a formatted report of the `topSize` most viewed
   * items.
   *
   * @param topSize number of items to include in the report
   */
  class TopN(val topSize: Int) extends KeyedProcessFunction[Long, ItemViewCount, String] {

    // Keyed state: only visible for the current key (the window end).
    lazy val listState = getRuntimeContext.getListState(
      new ListStateDescriptor[ItemViewCount]("list-state", Types.of[ItemViewCount])
    )

    /** Buffers the count and registers the timer that triggers the ranking. */
    override def processElement(value: ItemViewCount,
                                ctx: KeyedProcessFunction[Long, ItemViewCount, String]#Context,
                                out: Collector[String]): Unit = {
      listState.add(value)
      // Every element of the same window registers the same timestamp; per the
      // original author's note this does not create duplicate timers.
      ctx.timerService().registerEventTimeTimer(value.windowEnd + 100)
    }

    /** Sorts the buffered counts by view count (descending) and emits the report. */
    override def onTimer(timestamp: Long,
                         ctx: KeyedProcessFunction[Long, ItemViewCount, String]#OnTimerContext,
                         out: Collector[String]): Unit = {
      // Explicit converters instead of the deprecated implicit JavaConversions.
      import scala.collection.JavaConverters._

      // Copy the state into memory (list state itself cannot be sorted), then release it.
      val allItems = Option(listState.get()).map(_.asScala.toList).getOrElse(Nil)
      listState.clear()

      // Guard: without buffered items there is no window to report on.
      allItems.headOption.foreach { first =>
        val topItems = allItems.sortBy(_.count)(Ordering[Long].reverse).take(topSize)

        val result = new StringBuilder
        result
          .append("===========================\n")
          .append("窗口:" + new Timestamp(first.windowStart) + " ~~~ " + new Timestamp(first.windowEnd))
          .append("\n")

        for ((item, idx) <- topItems.zipWithIndex) {
          result
            .append("No.")
            .append(idx + 1)
            .append(":")
            .append(" 商品ID = ")
            .append(item.itemId)
            .append(" 浏览量 = ")
            .append(item.count)
            .append("\n")
        }

        result.append("===========================\n\n\n")

        // NOTE(review): this blocks the operator thread for one second per
        // report; kept to preserve the original timing (presumably to slow
        // down console output) - remove for any non-demo use.
        Thread.sleep(1000)
        out.collect(result.toString)
      }
    }
  }
}
2.Flink SQL 实现
import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.streaming.api.scala._
import org.apache.flink.table.api.EnvironmentSettings
import org.apache.flink.table.api.scala._
import org.apache.flink.types.Row
/**
 * Flink SQL version of the "hot items" job: the same sliding-window top-3
 * ranking expressed as a single SQL query on the Blink planner.
 *
 * Author's general note, kept from the original: the two hardest classes of
 * problems to debug are memory leaks and lock/thread contention. Garbage
 * collection addresses the first; immutable data, as encouraged by functional
 * programming, avoids the second because values are copied instead of shared.
 */
object HotItemsSQL {

  /**
   * One parsed line of UserBehavior.csv.
   *
   * @param timestamp fifth CSV column multiplied by 1000; registered as the
   *                  rowtime attribute `ts` of table `t`
   */
  case class UserBehavior(userId: Long,
                          itemId: Long,
                          categoryId: Long,
                          behavior: String,
                          timestamp: Long)

  /** Builds and runs the job; `args` is not used. */
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)
    // Parallelism 1: per the original author's note, splitting the bounded
    // file across parallel sources would interleave the records and break the
    // ascending-timestamp assumption.
    env.setParallelism(1)

    // Table environment backed by the Blink planner in streaming mode.
    val settings = EnvironmentSettings
      .newInstance()
      .useBlinkPlanner()
      .inStreamingMode()
      .build()
    val tableEnv = StreamTableEnvironment.create(env, settings)

    val rawLines = env
      .readTextFile("D:\\software\\code\\FlinkCode\\src\\main\\resources\\UserBehavior.csv")

    val pageViews = rawLines
      .map { line =>
        val cols = line.split(",")
        UserBehavior(cols(0).toLong, cols(1).toLong, cols(2).toLong, cols(3), cols(4).toLong * 1000L)
      }
      .filter(_.behavior.equals("pv"))
      // Watermarks are assigned on the single un-keyed stream, before any grouping.
      .assignAscendingTimestamps(_.timestamp)

    // Temporary view "t": `itemId` plus the event-time attribute exposed as `ts`.
    tableEnv.createTemporaryView("t", pageViews, 'itemId, 'timestamp.rowtime as 'ts)

    // Innermost query : per-item count over a 1 h window hopping every 5 min.
    // Middle query    : rank the items of each window by count, descending.
    // Outermost query : keep the first three ranks.
    // (Per the original author's note, this top-N pattern needs the Blink planner.)
    val topNQuery =
      """
        |SELECT *
        |FROM (
        | SELECT *,
        | ROW_NUMBER() OVER (PARTITION BY windowEnd ORDER BY icount DESC) as row_num
        | FROM (
        | SELECT itemId, count(itemId) as icount,
        | HOP_END(ts, INTERVAL '5' MINUTE, INTERVAL '1' HOUR) as windowEnd
        | FROM t GROUP BY itemId, HOP(ts, INTERVAL '5' MINUTE, INTERVAL '1' HOUR)
        | )
        |)
        |WHERE row_num <= 3
        |""".stripMargin

    val result = tableEnv.sqlQuery(topNQuery)

    result
      .toRetractStream[Row]
      // Keep only the records whose flag is true, i.e. drop the retractions.
      .filter(_._1)
      .print()

    env.execute()
  }
}
二、实时统计用户访问量
1.优化前的,去重占用很大空间
import java.sql.Timestamp
import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.api.scala.function.ProcessWindowFunction
import org.apache.flink.streaming.api.windowing.time.Time
import org.apache.flink.streaming.api.windowing.windows.TimeWindow
import org.apache.flink.util.Collector
/**
 * UV (unique visitors) per hour, naive version.
 *
 * Page-view records are de-duplicated by userId with an in-memory `Set`
 * inside a full-window function, so memory use grows with the number of
 * records in a window. The original author's motivating scenario is a window
 * with on the order of 100 million users per hour, for which a Bloom filter
 * (a bit array plus hash functions) is suggested instead.
 */
object UV {

  /**
   * One parsed line of UserBehavior.csv.
   *
   * @param timestamp fifth CSV column multiplied by 1000; used as the
   *                  event-time timestamp of the record
   */
  case class UserBehavior(userId: Long,
                          itemId: Long,
                          categoryId: Long,
                          behavior: String,
                          timestamp: Long)

  /** Builds and runs the job; `args` is not used. */
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)
    env.setParallelism(1)

    val pageViews = env
      .readTextFile("D:\\software\\code\\FlinkCode\\src\\main\\resources\\UserBehavior.csv")
      .map { line =>
        val cols = line.split(",")
        UserBehavior(cols(0).toLong, cols(1).toLong, cols(2).toLong, cols(3), cols(4).toLong * 1000L)
      }
      .filter(_.behavior.equals("pv"))
      .assignAscendingTimestamps(_.timestamp)

    // Every record gets the same constant key, so all of them land in one
    // logical stream; per the original author's note the map/keyBy/timeWindow
    // steps are equivalent to `.timeWindowAll(Time.hours(1))`.
    val uvPerHour = pageViews
      .map(r => ("key", r.userId))
      .keyBy(_._1)
      .timeWindow(Time.hours(1))
      // A windowed stream must be reduced to a DataStream before it can be printed.
      .process(new UvProcessFunc)

    uvPerHour.print()
    env.execute()
  }

  /**
   * Full-window function: invoked with all elements of the window and emits
   * one line containing the number of distinct userIds.
   */
  class UvProcessFunc extends ProcessWindowFunction[(String, Long), String, String, TimeWindow] {
    override def process(key: String,
                         context: Context,
                         elements: Iterable[(String, Long)],
                         out: Collector[String]): Unit = {
      // De-duplicate the userIds by collecting them into a Set.
      val distinctUsers = elements.iterator.map(_._2).toSet
      val window = context.window
      out.collect("窗口" + new Timestamp(window.getStart) + "---" + new Timestamp(window.getEnd) + "的UV数是:" + distinctUsers.size)
    }
  }
}
2.优化后,使用布隆过滤器过滤
优化原理:pv按照userid去重
// 大数据去重的常用解决方案之一:布隆过滤器(存在误判,结果为近似值)
import java.sql.Timestamp
import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.api.scala.function.ProcessWindowFunction
import org.apache.flink.streaming.api.windowing.time.Time
import org.apache.flink.streaming.api.windowing.triggers.{Trigger, TriggerResult}
import org.apache.flink.streaming.api.windowing.windows.TimeWindow
import org.apache.flink.util.Collector
import redis.clients.jedis.Jedis
/**
 * UV (unique visitors) per hour, de-duplicated with a single-hash Bloom
 * filter kept in Redis instead of an in-memory set.
 *
 * Redis layout used by this job:
 *  - bitmap stored under the key `<windowEnd>`: one bit per hashed userId
 *  - hash `UvCount`, field `<windowEnd>`: running UV count of that window
 *
 * Because a Bloom filter can report false positives, the count is
 * approximate: it can be too low, never too high.
 */
object vByBloomFilter {

  /**
   * One parsed line of UserBehavior.csv.
   *
   * @param timestamp fifth CSV column multiplied by 1000; used as the
   *                  event-time timestamp of the record
   */
  case class UserBehavior(userId: Long,
                          itemId: Long,
                          categoryId: Long,
                          behavior: String,
                          timestamp: Long)

  /** Builds and runs the job; `args` is not used. */
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)
    env.setParallelism(1)

    val stream = env
      .readTextFile("D:\\software\\code\\FlinkCode\\src\\main\\resources\\UserBehavior.csv")
      .map(line => {
        val arr = line.split(",")
        UserBehavior(arr(0).toLong, arr(1).toLong, arr(2).toLong, arr(3), arr(4).toLong * 1000L)
      })
      .filter(_.behavior.equals("pv"))
      .assignAscendingTimestamps(_.timestamp)
      .map(r => ("key", r.userId))
      .keyBy(_._1)
      .timeWindow(Time.hours(1))
      .trigger(new UvTrigger)
      .process(new UvProcessFunc)

    // UvProcessFunc never calls `out.collect`; the results live in Redis and
    // are printed by UvTrigger, so this sink receives no records.
    stream.print()
    env.execute()
  }

  /**
   * Fires and purges the window for every single element, so the window
   * never buffers more than one record.
   */
  class UvTrigger extends Trigger[(String, Long), TimeWindow] {

    /** Called once per element: evaluate the window and drop its contents. */
    override def onElement(element: (String, Long),
                           timestamp: Long,
                           window: TimeWindow,
                           ctx: Trigger.TriggerContext): TriggerResult = {
      TriggerResult.FIRE_AND_PURGE
    }

    override def onProcessingTime(time: Long,
                                  window: TimeWindow,
                                  ctx: Trigger.TriggerContext): TriggerResult = {
      TriggerResult.CONTINUE
    }

    /**
     * When the watermark has reached the end of the window, prints the final
     * UV count stored in Redis and returns FIRE_AND_PURGE; otherwise CONTINUE.
     *
     * Fix: the original evaluated `TriggerResult.FIRE_AND_PURGE` as a bare
     * statement inside the `if` and then always returned CONTINUE. It also
     * opened a new Redis connection on every qualifying call without closing it.
     */
    override def onEventTime(time: Long,
                             window: TimeWindow,
                             ctx: Trigger.TriggerContext): TriggerResult = {
      if (ctx.getCurrentWatermark >= window.getEnd) {
        val windowEnd = window.getEnd.toString
        val jedis = new Jedis("hadoop103", 6379)
        try {
          // Printed as a tuple, exactly like the original auto-tupled println.
          println((new Timestamp(windowEnd.toLong), jedis.hget("UvCount", windowEnd)))
        } finally {
          jedis.close()
        }
        TriggerResult.FIRE_AND_PURGE // "to be on the safe side" (original author)
      } else {
        TriggerResult.CONTINUE
      }
    }

    override def clear(window: TimeWindow, ctx: Trigger.TriggerContext): Unit = {}
  }

  /**
   * Updates the Redis bitmap and UV counter for the element handed over by
   * [[UvTrigger]]. Emits nothing downstream.
   */
  class UvProcessFunc extends ProcessWindowFunction[(String, Long), String, String, TimeWindow] {

    // Redis connection, created on first use.
    // NOTE(review): it is never closed explicitly.
    lazy val jedis = new Jedis("hadoop103", 6379)

    override def process(key: String,
                         context: Context,
                         elements: Iterable[(String, Long)],
                         out: Collector[String]): Unit = {
      // The window end identifies both the bitmap and the counter field.
      val windowEnd = context.window.getEnd.toString

      // The trigger purges after every element, so at most one element is
      // expected here; `headOption` also covers an invocation with no elements.
      elements.headOption.foreach { case (_, userId) =>
        // Position of this user in the bit array of 2^29 bits.
        val idx = hash(userId.toString, 1 << 29)

        // An unset bit means this user has definitely not been counted yet.
        if (!jedis.getbit(windowEnd, idx)) {
          // Single read of the counter; a missing field counts as 0.
          val count = Option(jedis.hget("UvCount", windowEnd)).map(_.toLong).getOrElse(0L)
          jedis.setbit(windowEnd, idx, true)
          jedis.hset("UvCount", windowEnd, (count + 1).toString)
        }
      }
    }
  }

  /**
   * Single hash function of the Bloom filter (one function only, for clarity).
   *
   * @param value string to hash
   * @param size  length of the bit array; the result is masked with
   *              `size - 1`, so this is only a uniform index when `size` is a
   *              power of two
   * @return index into the bit array, in `[0, size - 1]` for positive `size`
   */
  def hash(value: String, size: Long): Long = {
    val seed = 61
    val result = value.foldLeft(0L)((acc, ch) => acc * seed + ch)
    (size - 1) & result
  }
}
三、实时对账,双流Join
import org.apache.flink.api.common.state.ValueStateDescriptor
import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.streaming.api.functions.co.CoProcessFunction
import org.apache.flink.streaming.api.scala._
import org.apache.flink.util.Collector
/**
 * Real-time reconciliation of two streams: every order-payment event must be
 * matched by a third-party payment event with the same orderId. Matches are
 * emitted on the main output; events still unmatched 5 seconds (event time)
 * after their own timestamp go to a side output.
 */
object TwoStreamsJoin {

  /** Order payment event as seen by the order system. */
  case class OrderEvent(orderId: String,
                        eventType: String,
                        eventTime: Long)

  /** Payment event reported by a third-party provider (e.g. WeChat, Alipay). */
  case class PayEvent(orderId: String,
                      eventType: String,
                      eventTime: Long)

  /** Side output for order events that found no matching payment event. */
  val unmatchedOrders = new OutputTag[String]("unmatched-orders")

  /** Side output for third-party payment events that found no matching order event. */
  val unmatchedPays = new OutputTag[String]("unmatched-pays")

  /** Builds and runs the job on two small in-memory streams; `args` is not used. */
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)

    val orders = env
      .fromElements(
        OrderEvent("order_1", "pay", 2000L),
        OrderEvent("order_2", "pay", 5000L),
        OrderEvent("order_3", "pay", 6000L)
      )
      // Watermarks are assigned before the stream is keyed.
      .assignAscendingTimestamps(_.eventTime)
      .keyBy(_.orderId)

    val pays = env
      .fromElements(
        PayEvent("order_1", "weixin", 7000L),
        PayEvent("order_2", "weixin", 8000L),
        PayEvent("order_4", "weixin", 9000L)
      )
      // Watermarks are assigned before the stream is keyed.
      .assignAscendingTimestamps(_.eventTime)
      .keyBy(_.orderId)

    val processed = orders
      .connect(pays)
      .process(new MatchFunction)

    processed.print()
    // Read the side outputs through the very tags MatchFunction writes to,
    // instead of re-creating tags with the same ids, so the two cannot drift apart.
    processed.getSideOutput(unmatchedOrders).print()
    processed.getSideOutput(unmatchedPays).print()

    env.execute()
  }

  /**
   * Keyed by orderId. Whichever event arrives first is parked in state
   * together with a timeout timer; the arrival of its counterpart emits a
   * match and releases the parked state and timer.
   */
  class MatchFunction extends CoProcessFunction[OrderEvent, PayEvent, String] {

    // How long (event time, in milliseconds) an event waits for its counterpart.
    private val matchTimeoutMs = 5000L

    // Parked order event of the current key, if any.
    lazy val orderState = getRuntimeContext.getState(
      new ValueStateDescriptor[OrderEvent]("order-state", classOf[OrderEvent])
    )

    // Parked third-party payment event of the current key, if any.
    lazy val payState = getRuntimeContext.getState(
      new ValueStateDescriptor[PayEvent]("pay-state", classOf[PayEvent])
    )

    /** Handles an event of the order stream. */
    override def processElement1(order: OrderEvent,
                                 ctx: CoProcessFunction[OrderEvent, PayEvent, String]#Context,
                                 out: Collector[String]): Unit = {
      val pay = payState.value()
      if (pay != null) {
        // Same key, hence same orderId: the pair is reconciled.
        payState.clear()
        // The timeout timer of the parked payment is no longer needed.
        ctx.timerService().deleteEventTimeTimer(pay.eventTime + matchTimeoutMs)
        out.collect("订单ID为 " + order.orderId + " 的两条流对账成功!")
      } else {
        orderState.update(order)
        ctx.timerService().registerEventTimeTimer(order.eventTime + matchTimeoutMs)
      }
    }

    /** Handles an event of the third-party payment stream. */
    override def processElement2(pay: PayEvent,
                                 ctx: CoProcessFunction[OrderEvent, PayEvent, String]#Context,
                                 out: Collector[String]): Unit = {
      val order = orderState.value()
      if (order != null) {
        orderState.clear()
        // The timeout timer of the parked order is no longer needed.
        ctx.timerService().deleteEventTimeTimer(order.eventTime + matchTimeoutMs)
        out.collect("订单ID为 " + pay.orderId + " 的两条流对账成功!")
      } else {
        payState.update(pay)
        ctx.timerService().registerEventTimeTimer(pay.eventTime + matchTimeoutMs)
      }
    }

    /**
     * Timeout: whatever is still parked for this key was never matched and is
     * reported on the corresponding side output.
     *
     * @param timestamp timestamp of the firing timer
     * @param ctx       used to write to the side outputs via `ctx.output`
     * @param out       main-output collector; not used here
     */
    override def onTimer(timestamp: Long,
                         ctx: CoProcessFunction[OrderEvent, PayEvent, String]#OnTimerContext,
                         out: Collector[String]): Unit = {
      // Each state is read once; `Option(...)` maps an empty state (null) to None.
      Option(orderState.value()).foreach { order =>
        ctx.output(unmatchedOrders, "订单ID为 " + order.orderId + " 的两条流没有对账成功!")
        orderState.clear()
      }
      Option(payState.value()).foreach { pay =>
        ctx.output(unmatchedPays, "订单ID为 " + pay.orderId + " 的两条流没有对账成功!")
        payState.clear()
      }
    }
  }
}