1.为什么要学习底层 ProcessFunction API
- 可以单独给流中每条数据单独做处理
- 可以获取watermark 并注册定时事件
- 可以获取状态,将处理的结果保存起来
- 总结: 其他上层API做不了的事都可以使用ProcessFunction API
2.Flink提供了哪些ProcessFunction以及调用时机
- ProcessFunction
- KeyedProcessFunction: keyBy分组后调用
- CoProcessFunction
- ProcessJoinFunction
join两条流
CoProcessFunction提供了操作每一个输入流的方法: processElement1() 和 processElement2()
这两种方法都通过Context对象来调用。这个Context对象可以访问事件数据,定时器时间戳,TimerService,以及side outputs。CoProcessFunction也提供了onTimer()回调函数。
实现低阶join通常遵循此套路:
1.为一个(或两个)输入创建一个状态对象。
2.当从输入源收到元素时,更新状态。
3.从另一个输入接收元素后,检索状态并生成连接的结果。
- BroadcastProcessFunction
- KeyedBroadcastProcessFunction
- ProcessWindowFunction: 开窗之后调用
- ProcessAllWindowFunction: 全窗口后调用
3.应用案例
- KeyedProcessFunction的应用
/**
* 统计一个小时内的热门商品 5分钟更新一次
* 输出结果:
* 窗口结束时间:2017-11-26 12:20:00.0
* 窗口内容:
* NO 1: 商品ID = 2338453 热门度 = 27
* NO 2: 商品ID = 812879 热门度 = 18
* NO 3: 商品ID = 4443059 热门度 = 18
* NO 4: 商品ID = 3810981 热门度 = 14
* NO 5: 商品ID = 2364679 热门度 = 14
* 思路:
* 1.既然统计一个小时内 且5分钟更新一次的结果
* 定义滑动窗口:窗口大小为1 hour,步长为5min
* 定义增量聚合函数 拿到相同商品的次数
* 2.定义全窗口函数 拿到窗口的截止时间
* <p>
* 3.状态编程 根据窗口结束时间keyBy
* 定义定时器
* 定时器结束后输出所有状态
*/
public class HotItems {
/**
 * Job entry point: reads user-behavior lines from a CSV file, keeps only "pv" records, counts
 * them per item over a sliding window (1 hour long, advancing every 5 minutes), and prints
 * the top 5 items for each window end.
 *
 * @param args command-line arguments (not used)
 * @throws Exception if building or executing the Flink job fails
 */
public static void main(String[] args) throws Exception {
    // 1. Create the execution environment: event time, parallelism 1.
    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);
    env.setParallelism(1);
    DataStream<String> inputPath = env.readTextFile("/Users/liangfangwei/IdeaProjects/flinkUserAnalays/data_file/UserBehavior.csv");
    // 2. Processing logic; emits (item id, count, window end) records.
    // 2.1 Window and aggregate.
    DataStream<ItemViewCount> dataStream = inputPath.map(line -> {
        String[] split = line.split(",");
        // Use parseLong/parseInt rather than the deprecated new Long(String)/new Integer(String)
        // boxing constructors.
        return new ItemBean(Long.parseLong(split[0]), Long.parseLong(split[1]), Integer.parseInt(split[2]), split[3], Long.parseLong(split[4]));
    }).filter(itemBean -> "pv".equals(itemBean.getBehavior()))
            .assignTimestampsAndWatermarks(new AscendingTimestampExtractor<ItemBean>() {
                @Override
                public long extractAscendingTimestamp(ItemBean itemBean) {
                    // NOTE(review): presumably the CSV timestamp is in seconds and is
                    // converted to milliseconds here - confirm against the data file.
                    return itemBean.getTimestamp() * 1000L;
                }
            }).keyBy("itemId")
            .timeWindow(Time.hours(1), Time.minutes(5))
            /*
             * Combined use: when a ReduceFunction/AggregateFunction is paired with a window
             * function, elements assigned to a window are pre-aggregated as they arrive.
             * When the window's trigger fires, the aggregated result is handed to the window
             * function, whose Iterable argument then holds a single value: that aggregate.
             */
            .aggregate(new MyAggreateFunction(), new MyAllWinAggreateFunction());
    // 2.2 Buffer the per-window counts in state and emit the top N per window end.
    SingleOutputStreamOperator<String> waterEnd = dataStream.keyBy("windowEnd").process(new TopNHotItems(5));
    waterEnd.print();
    env.execute();
}
/**
 * Incremental window aggregate that counts the elements added to a window.
 * The accumulator is the running count; the result is that count.
 */
public static class MyAggreateFunction implements AggregateFunction<ItemBean, Long, Long> {
    /** Starts every window at a count of zero. */
    @Override
    public Long createAccumulator() {
        return 0L;
    }

    /** Adds one to the count for each incoming element; the element itself is not inspected. */
    @Override
    public Long add(ItemBean itemBean, Long accumulator) {
        return accumulator + 1;
    }

    /** Returns the accumulated count unchanged. */
    @Override
    public Long getResult(Long accumulator) {
        return accumulator;
    }

    /**
     * Combines two partial counts by summing them.
     * Previously this returned {@code null}, which would lose the count (or fail) whenever
     * the runtime merges accumulators, e.g. with merging windows.
     */
    @Override
    public Long merge(Long a, Long b) {
        return a + b;
    }
}
/**
 * Window function applied after the incremental aggregation: wraps the key's item id, the
 * window end timestamp and the pre-aggregated count into an {@code ItemViewCount}.
 */
public static class MyAllWinAggreateFunction implements WindowFunction<Long, ItemViewCount, Tuple, TimeWindow> {
    /**
     * Emits exactly one {@code ItemViewCount} for the given key and window.
     *
     * @param tuple  the key; its field 0 is read as the item id
     * @param window the window being evaluated; its end timestamp is attached to the output
     * @param input  the aggregated values; only the first element is read
     * @param out    collector receiving the result record
     */
    @Override
    public void apply(Tuple tuple, TimeWindow window, Iterable<Long> input, Collector<ItemViewCount> out) throws Exception {
        final Long keyedItemId = tuple.getField(0);
        final long endOfWindow = window.getEnd();
        final Long viewCount = input.iterator().next();
        ItemViewCount record = new ItemViewCount(keyedItemId, endOfWindow, viewCount);
        out.collect(record);
    }
}
/**
 * Keyed process function that buffers every {@code ItemViewCount} of the current key in list
 * state and, when an event-time timer set to the record's window end + 1 fires, emits a
 * formatted report of the {@code topSize} records with the highest counts.
 */
public static class TopNHotItems extends KeyedProcessFunction<Tuple, ItemViewCount, String> {
    // List state holding the records buffered for the current key.
    ListState<ItemViewCount> hotItems;
    // Not read or written anywhere in this class.
    long triggersTs = 0;
    // Size of the top-N report.
    private Integer topSize;

    /**
     * @param topSize maximum number of entries printed per report
     */
    public TopNHotItems(Integer topSize) {
        this.topSize = topSize;
    }

    /**
     * Obtains the list state handle from the runtime context.
     *
     * @param configuration passed in by the runtime (not used here)
     * @throws Exception declared by the overridden method
     */
    @Override
    public void open(Configuration configuration) throws Exception {
        hotItems = getRuntimeContext().getListState(new ListStateDescriptor<ItemViewCount>("hot_items", ItemViewCount.class));
    }

    /**
     * Buffers the incoming record and registers an event-time timer at window end + 1.
     *
     * @param value the per-item count for one window
     * @param ctx   context giving access to the timer service
     * @param out   output collector (nothing is emitted here)
     * @throws Exception if state access fails
     */
    @Override
    public void processElement(ItemViewCount value, Context ctx, Collector<String> out) throws Exception {
        hotItems.add(value);
        // Every record carries its window end, so each call registers a timer for
        // window end + 1.
        ctx.timerService().registerEventTimeTimer(value.getWindowEnd() + 1);
    }

    /**
     * Called when the registered timer fires: sorts the buffered records by count in
     * descending order, emits the top {@code topSize} as one formatted string, and clears
     * the buffered state.
     *
     * @param timestamp the timer's timestamp (window end + 1)
     * @param ctx       timer context (not used)
     * @param out       collector receiving the formatted report
     * @throws Exception if state access fails
     */
    @Override
    public void onTimer(long timestamp, OnTimerContext ctx, Collector<String> out) throws Exception {
        // Copy everything out of state.
        ArrayList<ItemViewCount> itemViewCounts = Lists.newArrayList(hotItems.get().iterator());
        // The buffered records are no longer needed once copied; without clearing, state for
        // every window end would be kept indefinitely.
        hotItems.clear();
        // Sort in descending order of count. Long.compare avoids the truncation/overflow of
        // subtracting int values.
        itemViewCounts.sort(new Comparator<ItemViewCount>() {
            @Override
            public int compare(ItemViewCount o1, ItemViewCount o2) {
                return Long.compare(o2.getCount(), o1.getCount());
            }
        });
        StringBuilder stringBuilder = new StringBuilder();
        stringBuilder.append("===================================\n");
        // The timer was registered at window end + 1, so subtract 1 to print the window end.
        stringBuilder.append("窗口结束时间:").append(new Timestamp(timestamp - 1)).append("\n");
        int limit = Math.min(topSize, itemViewCounts.size());
        for (int i = 0; i < limit; i++) {
            ItemViewCount currentItemViewCount = itemViewCounts.get(i);
            stringBuilder.append("NO ").append(i + 1).append(":")
                    .append(" 商品ID = ").append(currentItemViewCount.getItemId())
                    .append(" 热门度 = ").append(currentItemViewCount.getCount())
                    .append("\n");
        }
        stringBuilder.append("===============================\n\n");
        out.collect(stringBuilder.toString());
    }
}
}
- ProcessAllWindowFunction
/**
* @author :LiangFangWei
* @date: 2021-12-21 15:55
* - _ooOoo_
* - o8888888o
* - 88" . "88
* - (| -_- |)
* - O\ = /O
* - ____/`---'\____
* - . ' \\| |// `.
* - / \\||| : |||// \
* - / _||||| -:- |||||- \
* - | | \\\ - /// | |
* - | \_| ''\---/'' | |
* - \ .-\__ `-` ___/-. /
* - ___`. .' /--.--\ `. . __
* - ."" '< `.___\_<|>_/___.' >'"".
* - | | : `- \`.;`\ _ /`;.`/ - ` : | |
* - \ \ `-. \_ __\ /__ _/ .-` / /
* ======`-.____`-.___\_____/___.-`____.-'======
* .............................................
* - 佛祖保佑 永无BUG
* <p>
* 需求:实时输出统计每个小时内的uv。每个小时内用户去重数实时输出
* 思路:
* <p>
* 在一小时的时间窗口内,每来一条数据 触发计算 。
* 计算逻辑:
* 1.取当前数据去redis的位图中查有没有
* 查询的key为 时间窗口的结束时间
* 查询的offset为 userID的hash值
* 2. 如果没有给查询的位置 置为1
* <p>
* 去判断redis的位图中有没有
* 如果有丢弃 如果没有 count+1 将新的值存到redis中
*/
public class HotUVWithBloomFilter {
/**
 * Job entry point: reads user-behavior lines from a CSV file, keeps "pv" records, and for
 * each record in a one-hour tumbling all-window runs the Redis-backed de-duplication in
 * {@code UVProcessFunction}, printing every emitted count.
 *
 * @param args command-line arguments (not used)
 * @throws Exception if building or executing the Flink job fails
 */
public static void main(String[] args) throws Exception {
    // Environment: event time, parallelism 1.
    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);
    env.setParallelism(1);

    // Source: one CSV line per record.
    DataStreamSource<String> rawLines = env.readTextFile("/Users/liangfangwei/IdeaProjects/flinkUserAnalays/data_file/UserBehavior.csv");

    // Parse each line into an ItemBean.
    SingleOutputStreamOperator<ItemBean> parsed = rawLines.map(row -> {
        String[] cols = row.split(",");
        return new ItemBean(Long.parseLong(cols[0]), Long.parseLong(cols[1]), Integer.parseInt(cols[2]), cols[3], Long.parseLong(cols[4]));
    });

    // Assign ascending event-time timestamps (bean timestamp * 1000).
    SingleOutputStreamOperator<ItemBean> timestamped = parsed.assignTimestampsAndWatermarks(new AscendingTimestampExtractor<ItemBean>() {
        @Override
        public long extractAscendingTimestamp(ItemBean element) {
            return element.getTimestamp() * 1000L;
        }
    });

    // Keep page-view records only.
    SingleOutputStreamOperator<ItemBean> filterData = timestamped.filter(bean -> "pv".equals(bean.getBehavior()));

    SingleOutputStreamOperator<PageViewCount> uvCounts = filterData
            // One-hour tumbling window over the whole stream.
            .timeWindowAll(Time.hours(1))
            // Custom trigger: evaluate on every element instead of waiting for the window to end.
            .trigger(new UVTriigger())
            // Look the user id up in the Redis bitmap and insert it when it is absent.
            .process(new UVProcessFunction());

    uvCounts.print();
    env.execute();
}
/**
 * Trigger that fires and purges the window on every incoming element and never reacts to
 * processing-time or event-time timers. Declared as a static nested class so it needs no
 * separate source file.
 */
public static class UVTriigger extends Trigger<ItemBean, TimeWindow> {

    /** Every element immediately fires the window and purges its contents. */
    @Override
    public TriggerResult onElement(ItemBean element, long timestamp, TimeWindow window, TriggerContext ctx) throws Exception {
        return TriggerResult.FIRE_AND_PURGE;
    }

    /** Event-time timers cause no action. */
    @Override
    public TriggerResult onEventTime(long time, TimeWindow window, TriggerContext ctx) throws Exception {
        return TriggerResult.CONTINUE;
    }

    /** Processing-time timers cause no action. */
    @Override
    public TriggerResult onProcessingTime(long time, TimeWindow window, TriggerContext ctx) throws Exception {
        return TriggerResult.CONTINUE;
    }

    /** Nothing to clean up: this trigger keeps no state and registers no timers. */
    @Override
    public void clear(TimeWindow window, TriggerContext ctx) throws Exception {
    }
}
/**
 * All-window process function that de-duplicates users per window with a Redis bitmap.
 * The bitmap key is the window end timestamp and the bit offset is a hash of the user id.
 * For each user id whose bit is not yet set, the per-window counter stored in the Redis hash
 * {@code uv_page_count} is incremented and the new count is emitted.
 */
public static class UVProcessFunction extends ProcessAllWindowFunction<ItemBean, PageViewCount, TimeWindow> {
    private Jedis jedis;
    // Redis hash holding one counter field per window end.
    private String pageCountKey = "uv_page_count";
    private BloomFilter bloomFilter;

    /**
     * Opens the Redis connection and creates the hash helper with a capacity of 2^29 bits.
     *
     * @param parameters passed in by the runtime (not used here)
     * @throws Exception declared by the overridden method
     */
    @Override
    public void open(Configuration parameters) throws Exception {
        jedis = new Jedis("localhost", 6379);
        bloomFilter = new BloomFilter(1 << 29);
    }

    /**
     * Releases the Redis connection when the function is shut down; previously the
     * connection was never closed.
     *
     * @throws Exception declared by the overridden method
     */
    @Override
    public void close() throws Exception {
        if (jedis != null) {
            jedis.close();
            jedis = null;
        }
        super.close();
    }

    /**
     * Checks the first element of the window against the Redis bitmap and, if its bit is
     * not set, sets it, increments the window's counter and emits the new count.
     *
     * @param context  window context; supplies the window end used as Redis key/field
     * @param elements window contents; only the first element is read
     * @param out      collector receiving the updated count
     * @throws Exception if the Redis access fails
     */
    @Override
    public void process(Context context, Iterable<ItemBean> elements, Collector<PageViewCount> out) throws Exception {
        Long windowEnd1 = context.window().getEnd();
        String windowEnd = windowEnd1.toString();
        ItemBean itemBean = elements.iterator().next();
        Long userId = itemBean.getUserId();
        long offset = bloomFilter.hash(userId.toString(), 61);
        boolean isExist = jedis.getbit(windowEnd, offset);
        if (!isExist) {
            jedis.setbit(windowEnd, offset, true);
            // HINCRBY creates the field at 0 when it is missing and increments atomically,
            // replacing the earlier read-parse-write sequence.
            long uvCount = jedis.hincrBy(pageCountKey, windowEnd, 1L);
            out.collect(new PageViewCount("uv", windowEnd1, uvCount));
        }
    }
}
/**
 * Minimal hash helper that maps a string to a bit offset in the range
 * {@code [0, capacity)}.
 */
public static class BloomFilter {
    // Number of addressable bit positions; must be positive.
    private final long capacity;

    /**
     * @param capacity number of bit positions; must be greater than zero
     * @throws IllegalArgumentException if {@code capacity} is not positive
     */
    public BloomFilter(long capacity) {
        if (capacity <= 0) {
            throw new IllegalArgumentException("capacity must be positive: " + capacity);
        }
        this.capacity = capacity;
    }

    /**
     * Computes a polynomial hash of {@code userId} with the given multiplier and reduces it
     * to an offset in {@code [0, capacity)}.
     *
     * @param userId text to hash
     * @param seed   multiplier applied before each character is added
     * @return a non-negative offset smaller than the capacity
     */
    public long hash(String userId, int seed) {
        long result = 0L;
        for (int i = 0; i < userId.length(); i++) {
            // May overflow long for long inputs; the wrap-around is harmless because the
            // final reduction below always yields a non-negative value.
            result = result * seed + userId.charAt(i);
        }
        // floorMod equals "result & (capacity - 1)" when capacity is a power of two, and is
        // still a correct modulo for every other positive capacity.
        return Math.floorMod(result, capacity);
    }
}
}
- CoProcessFunction
/**
 * Overall approach:
 * - If a stream-1 element arrives first, it is stored in stream 1's state;
 *   if a stream-2 element arrives first, it is stored in stream 2's state.
 * - A 60 s timer is then registered. If the other stream's element arrives within 60 s, the
 *   two elements are joined and sent downstream; otherwise the stored element is emitted to
 *   a side output. Both streams are handled the same way.
 * - An extra state remembers the timer so it can be deleted: once the other stream's element
 *   has arrived within 60 s the timer is no longer needed and is removed promptly.
 */
// Stream 1: key by id first.
// NOTE(review): keyBy(1) addresses a field by position but the element type is String -
// confirm the intended element type / key selector.
DataStreamSource<String> sourceStream1 = env.addSource(consumer);
KeyedStream<String, Tuple> stream1 = sourceStream1.keyBy(1);
// Stream 2: key by id first. Built from sourceStream2 (it was previously built from
// sourceStream1, so stream 2's source was never connected).
DataStreamSource<String> sourceStream2 = env.addSource(consumer);
KeyedStream<String, Tuple> stream2 = sourceStream2.keyBy(1);
// Output tags for the two side outputs. They are created as anonymous subclasses so the
// element type can be extracted from the generic parameter.
OutputTag<String> outputTag1 = new OutputTag<String>("stream1") {};
OutputTag<String> outputTag2 = new OutputTag<String>("stream2") {};
stream1.connect(stream2).process(new CoProcessFunction<String, String, Tuple2<String, String>>() {
    // How long an element waits for its counterpart, in milliseconds.
    private static final long JOIN_TIMEOUT_MS = 60000L;

    // Pending element from stream 1.
    ValueState<String> state1;
    // Pending element from stream 2.
    ValueState<String> state2;
    // Timestamp of the currently registered timer, kept so the timer can be deleted.
    ValueState<Long> timeState;

    /** Creates the three keyed state handles. */
    @Override
    public void open(Configuration parameters) throws Exception {
        super.open(parameters);
        state1 = getRuntimeContext().getState(new ValueStateDescriptor<>("state1", String.class));
        state2 = getRuntimeContext().getState(new ValueStateDescriptor<>("state2", String.class));
        timeState = getRuntimeContext().getState(new ValueStateDescriptor<>("timeState", Long.class));
    }

    /**
     * Handles a stream-1 element: joins it with a pending stream-2 element if there is one,
     * otherwise stores it and starts the timeout.
     */
    @Override
    public void processElement1(String value, Context ctx, Collector<Tuple2<String, String>> out) throws Exception {
        String value2 = state2.value();
        if (value2 != null) {
            // Stream 2 arrived first: emit the joined pair and drop the pending element.
            out.collect(Tuple2.of(value, value2));
            state2.clear();
            cancelTimeout(ctx);
        } else {
            // Stream 2 has not arrived yet: remember this element and wait.
            state1.update(value);
            scheduleTimeout(ctx);
        }
    }

    /** Handles a stream-2 element; mirrors {@code processElement1}. */
    @Override
    public void processElement2(String value, Context ctx, Collector<Tuple2<String, String>> out) throws Exception {
        String value1 = state1.value();
        if (value1 != null) {
            out.collect(Tuple2.of(value1, value));
            state1.clear();
            cancelTimeout(ctx);
        } else {
            state2.update(value);
            scheduleTimeout(ctx);
        }
    }

    /**
     * Timer fired: the counterpart did not arrive in time. Whatever is still pending is
     * emitted to the matching side output and all state is cleared.
     */
    @Override
    public void onTimer(long timestamp, OnTimerContext ctx, Collector<Tuple2<String, String>> out) throws Exception {
        super.onTimer(timestamp, ctx, out);
        String pending1 = state1.value();
        if (pending1 != null) {
            ctx.output(outputTag1, pending1);
        }
        String pending2 = state2.value();
        if (pending2 != null) {
            ctx.output(outputTag2, pending2);
        }
        state1.clear();
        state2.clear();
        // The timer has fired, so its recorded timestamp is stale.
        timeState.clear();
    }

    /** Registers a timer at the current element's event time + timeout and records it. */
    private void scheduleTimeout(Context ctx) throws Exception {
        Long eventTime = ctx.timestamp();
        if (eventTime == null) {
            throw new IllegalStateException("Element has no timestamp; an event-time timeout requires timestamps to be assigned");
        }
        // Replace any timer still pending for this key so only the recorded one can fire.
        cancelTimeout(ctx);
        long time = eventTime + JOIN_TIMEOUT_MS;
        timeState.update(time);
        ctx.timerService().registerEventTimeTimer(time);
    }

    /** Deletes the recorded timer, if any, and clears the record. */
    private void cancelTimeout(Context ctx) throws Exception {
        Long registered = timeState.value();
        if (registered != null) {
            ctx.timerService().deleteEventTimeTimer(registered);
            timeState.clear();
        }
    }
});