flink 1.10.1 java版本窗口计算案例(全量、增量、排序)

本文的基础环境可以参考flink 1.10.1 java版本wordcount演示 (nc + socket)

flink的窗口计算方法有多种,根据业务需求的特点,可以进行灵活选择。

apply方法用于对窗口进行全量聚合,窗口每触发一次时,会调用一次apply方法,相当于是对窗口中的全量数据进行计算;

aggregate方法实现增量聚合,同时自定义增量聚合和窗口关闭时的数据输出;

ProcessFunction是一个低级的流处理操作,可以被认为是增加了keyed state和timers功能的FlatMapFunction。ProcessFunction可以通过RuntimeContext访问Flink中的Keyed State,通过processElement方法中的Context实例访问流元素的时间戳以及TimerService(用于注册定时器)。如果watermark大于等于注册定时器的时间,就会调用onTimer方法(此处相当于一个回调函数);在调用期间,所有state的范围再次限定在创建定时器的key上,从而允许定时器操作keyed state。

1. 添加依赖

<dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>fastjson</artifactId>
            <version>1.2.47</version>
        </dependency>

2. apply全量聚合

package com.demo.realstatis;

import com.alibaba.fastjson.JSON;
import org.apache.flink.api.java.tuple.Tuple;
import org.apache.flink.streaming.api.TimeCharacteristic;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.KeyedStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.datastream.WindowedStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.ProcessFunction;
import org.apache.flink.streaming.api.functions.timestamps.BoundedOutOfOrdernessTimestampExtractor;
import org.apache.flink.streaming.api.functions.windowing.WindowFunction;
import org.apache.flink.streaming.api.windowing.assigners.SlidingEventTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
import org.apache.flink.util.Collector;


/**
 * Full-window aggregation example built on {@code WindowedStream.apply}.
 *
 * <p>Reads JSON lines from a socket, parses them into {@link MyBehavior} records,
 * keys them by {@code itemId} and {@code type}, and for every sliding event-time
 * window emits one {@link ItemViewCount} holding the number of records seen.
 */
public class HotGoodsApply {

    /**
     * Assembles the pipeline and submits it under the job name {@code HotGoodsTopN}.
     *
     * @param args unused command-line arguments
     * @throws Exception if the job cannot be built or executed
     */
    public static void main(String[] args) throws Exception {

        StreamExecutionEnvironment environment = StreamExecutionEnvironment.getExecutionEnvironment();

        // Event time drives the windows, checkpoints run every 60000 ms, parallelism is 1.
        environment.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);
        environment.enableCheckpointing(60000);
        environment.setParallelism(1);

        DataStreamSource<String> rawLines = environment.socketTextStream("192.168.0.181", 9000);

        SingleOutputStreamOperator<MyBehavior> parsed = rawLines.process(new JsonToBehavior());

        // Timestamps come from the record itself; the out-of-orderness bound is zero seconds.
        SingleOutputStreamOperator<MyBehavior> timestamped =
                parsed.assignTimestampsAndWatermarks(new BehaviorTimestamps());

        // One logical stream per (itemId, type) pair.
        KeyedStream<MyBehavior, Tuple> byItemAndType = timestamped.keyBy("itemId", "type");

        // Ten-minute windows that slide forward every minute.
        WindowedStream<MyBehavior, Tuple, TimeWindow> slidingWindows =
                byItemAndType.window(SlidingEventTimeWindows.of(Time.minutes(10), Time.minutes(1)));

        SingleOutputStreamOperator<ItemViewCount> counts = slidingWindows.apply(new CountPerWindow());

        counts.print();
        environment.execute("HotGoodsTopN");
    }

    /** Turns one JSON text line into a {@link MyBehavior}; empty lines are dropped. */
    private static final class JsonToBehavior extends ProcessFunction<String, MyBehavior> {

        private static final long serialVersionUID = 1L;

        @Override
        public void processElement(String input, Context ctx, Collector<MyBehavior> out) throws Exception {
            if (input == null || input.isEmpty()) {
                return;
            }
            try {
                // The parser fills the long timestamp field from the incoming record.
                out.collect(JSON.parseObject(input, MyBehavior.class));
            } catch (Exception e) {
                // Best effort: a record that fails to parse is reported and skipped.
                e.printStackTrace();
                //TODO record the data that caused the exception
            }
        }
    }

    /** Supplies the record's own {@code timestamp} field as its event time. */
    private static final class BehaviorTimestamps extends BoundedOutOfOrdernessTimestampExtractor<MyBehavior> {

        private static final long serialVersionUID = 1L;

        BehaviorTimestamps() {
            super(Time.seconds(0));
        }

        @Override
        public long extractTimestamp(MyBehavior element) {
            return element.timestamp;
        }
    }

    /**
     * Counts every record of a fired window and emits a single {@link ItemViewCount}.
     * Type parameters: input record, output record, key tuple, window type.
     */
    private static final class CountPerWindow
            implements WindowFunction<MyBehavior, ItemViewCount, Tuple, TimeWindow> {

        private static final long serialVersionUID = 1L;

        @Override
        public void apply(Tuple tuple, TimeWindow window, Iterable<MyBehavior> input,
                          Collector<ItemViewCount> out) throws Exception {
            // Grouping fields, in the order they were passed to keyBy.
            String item = tuple.getField(0);
            String behaviorType = tuple.getField(1);

            // Window boundaries.
            long windowStart = window.getStart();
            long windowEnd = window.getEnd();

            // Walk the whole window content; each record contributes one.
            int seen = 0;
            for (MyBehavior ignored : input) {
                seen++;
            }

            out.collect(ItemViewCount.of(item, behaviorType, windowStart, windowEnd, seen));
        }
    }

}

3. 增量聚合和Process

package com.demo.realstatis;

import com.alibaba.fastjson.JSON;
import org.apache.flink.api.common.state.ValueState;
import org.apache.flink.api.common.state.ValueStateDescriptor;
import org.apache.flink.api.common.typeinfo.TypeHint;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.api.java.tuple.Tuple;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.TimeCharacteristic;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.KeyedStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.datastream.WindowedStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.KeyedProcessFunction;
import org.apache.flink.streaming.api.functions.ProcessFunction;
import org.apache.flink.streaming.api.functions.timestamps.BoundedOutOfOrdernessTimestampExtractor;
import org.apache.flink.streaming.api.windowing.assigners.SlidingEventTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
import org.apache.flink.util.Collector;

import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;


/**
 * Incremental aggregation plus sorting example.
 *
 * <p>Reads JSON lines from a socket, parses them into {@link MyBehavior} records and
 * counts them per {@code (itemId, type)} key in sliding event-time windows using
 * {@link MyWindowAggFunction} and {@link MyWindowFunction}. The per-window results are
 * then re-keyed by {@code (type, windowStart, windowEnd)}, buffered in keyed state and,
 * when an event-time timer fires, emitted as one list sorted by descending
 * {@code viewCount}.
 */
public class HotGoodsAggrAndProcess {

    /**
     * Assembles the pipeline and submits it under the job name {@code HotGoodsTopNAdv}.
     *
     * @param args unused command-line arguments
     * @throws Exception if the job cannot be built or executed
     */
    public static void main(String[] args) throws Exception {

        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        // Use event time as the time characteristic.
        env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);
        // Checkpoint interval in milliseconds.
        env.enableCheckpointing(60000);
        // Run with a parallelism of 1.
        env.setParallelism(1);

        DataStreamSource<String> lines = env.socketTextStream("192.168.0.181", 9000);


        SingleOutputStreamOperator<MyBehavior> process = lines.process(new ProcessFunction<String, MyBehavior>() {
            @Override
            public void processElement(String input, Context ctx, Collector<MyBehavior> out) throws Exception {

                try {

                    if (input == null || input.length() == 0) {
                        return;
                    }

                    // The parser fills the long timestamp field from the incoming record.
                    MyBehavior behavior = JSON.parseObject(input, MyBehavior.class);
                    out.collect(behavior);
                } catch (Exception e) {
                    // Best effort: a record that fails to parse is reported and skipped.
                    e.printStackTrace();
                    //TODO record the data that caused the exception
                }
            }
        });

        // Timestamps come from the record itself; the out-of-orderness bound is zero seconds.
        SingleOutputStreamOperator<MyBehavior> behaviorDSWithWaterMark =
                process.assignTimestampsAndWatermarks(new BoundedOutOfOrdernessTimestampExtractor<MyBehavior>(Time.seconds(0)) {
                    @Override
                    public long extractTimestamp(MyBehavior element) {
                        return element.timestamp;
                    }
                });

        // How many times an item was clicked / bought / added to cart / favourited within a window.
        KeyedStream<MyBehavior, Tuple> keyed = behaviorDSWithWaterMark.keyBy("itemId", "type");

        WindowedStream<MyBehavior, Tuple, TimeWindow> window =
                keyed.window(SlidingEventTimeWindows.of(Time.minutes(10), Time.minutes(1)));

        // Incremental count per window, wrapped into an ItemViewCount when the window fires.
        SingleOutputStreamOperator<ItemViewCount> windowAggregate = window.aggregate(new MyWindowAggFunction(),
                new MyWindowFunction());

        // Group the results of the same behaviour type and the same window together.
        KeyedStream<ItemViewCount, Tuple> sortedKeyed = windowAggregate.keyBy("type", "windowStart",
                "windowEnd");

        SingleOutputStreamOperator<List<ItemViewCount>> sorted = sortedKeyed.process(new KeyedProcessFunction<Tuple, ItemViewCount, List<ItemViewCount>>() {
            private transient ValueState<List<ItemViewCount>> valueState;


            // All ItemViewCount records of one key are collected as an intermediate
            // result, so a ValueState holding the list is set up here.
            @Override
            public void open(Configuration parameters) throws Exception {
                ValueStateDescriptor<List<ItemViewCount>> descriptor =
                        new ValueStateDescriptor<>("list-state",
                                TypeInformation.of(new TypeHint<List<ItemViewCount>>() {
                                })
                        );

                valueState = getRuntimeContext().getState(descriptor);
            }


            // Appends the record to the buffered list and registers the timer.
            @Override
            public void processElement(ItemViewCount input, Context ctx, Collector<List<ItemViewCount>> out) throws Exception {
                List<ItemViewCount> buffer = valueState.value();
                if (buffer == null) {
                    buffer = new ArrayList<>();
                }
                buffer.add(input);
                valueState.update(buffer);
                // Event-time timer one millisecond after the window end; onTimer runs once
                // the watermark reaches that timestamp.
                ctx.timerService().registerEventTimeTimer(input.windowEnd + 1);
            }


            // Sorts the buffered records and emits them as one list.
            @Override
            public void onTimer(long timestamp, OnTimerContext ctx, Collector<List<ItemViewCount>> out) throws Exception {

                // Take the buffered records out of the state.
                List<ItemViewCount> buffer = valueState.value();
                if (buffer == null) {
                    // Nothing is buffered for this key, so there is nothing to emit.
                    return;
                }
                buffer.sort(new Comparator<ItemViewCount>() {
                    @Override
                    public int compare(ItemViewCount o1, ItemViewCount o2) {
                        // Descending by viewCount. Long.compare avoids the truncation and
                        // overflow of casting a long difference to int.
                        return Long.compare(o2.viewCount, o1.viewCount);
                    }
                });
                // Release the state for this key before emitting.
                valueState.clear();
                out.collect(buffer);
            }
        });
        sorted.print();
        env.execute("HotGoodsTopNAdv");
    }

}

4. 辅助类ItemViewCount

package com.demo.realstatis;

import java.sql.Timestamp;

/**
 * Aggregated result for one item and behaviour type within one window.
 *
 * <p>The fields are public and are referenced by name (for example
 * {@code keyBy("type", "windowStart", "windowEnd")} in the jobs of this article), so
 * their names must stay as they are.
 */
public class ItemViewCount {
    public String itemId;     // item ID
    public String type;       // behaviour type
    public long windowStart;  // window start timestamp
    public long windowEnd;    // window end timestamp
    public long viewCount;    // number of behaviours counted for the item


    /**
     * Creates a populated instance.
     *
     * @param itemId      item ID
     * @param type        behaviour type
     * @param windowStart window start timestamp
     * @param windowEnd   window end timestamp
     * @param viewCount   counted value for the window
     * @return a new {@code ItemViewCount} carrying the given values
     */
    public static ItemViewCount of(String itemId, String type, long windowStart, long windowEnd, long viewCount) {
        ItemViewCount result = new ItemViewCount();
        result.itemId = itemId;
        result.type = type;
        result.windowStart = windowStart;
        result.windowEnd = windowEnd;
        result.viewCount = viewCount;
        return result;
    }


    /**
     * Renders every field; each window boundary is shown both as the raw long value
     * and as a {@link Timestamp}.
     */
    @Override
    public String toString() {
        // The separator before "type=" was missing, which glued the first two fields together.
        return "{" +
                "itemId='" + itemId + '\'' +
                ", type='" + type + '\'' +
                ", windowStart=" + windowStart + " , " + new Timestamp(windowStart) +
                ", windowEnd=" + windowEnd + " , " + new Timestamp(windowEnd) +
                ", viewCount=" + viewCount +
                '}';
    }
}

5. 辅助类MyBehavior

package com.demo.realstatis;

import java.sql.Timestamp;


public class MyBehavior {
    public String userId;           // 用户ID
    public String itemId;           // 商品ID
    public String categoryId;       // 商品类目ID
    public String type;             // 用户行为, 包括("pv", "buy", "cart", "fav")
    public long timestamp;          // 行为发生的时间戳,单位秒
    public long counts = 1;


    public static MyBehavior of(String userId, String itemId, String categoryId, String type, long timestamp) {
        MyBehavior behavior = new MyBehavior();
        behavior.userId = userId;
        behavior.itemId = itemId;
        behavior.categoryId = categoryId;
        behavior.type = type;
        behavior.timestamp = timestamp;
        return behavior;
    }


    public static MyBehavior of(String userId, String itemId, String categoryId, String type, long timestamp,
                                long counts) {
        MyBehavior behavior = new MyBehavior();
        behavior.userId = userId;
        behavior.itemId = itemId;
        behavior.categoryId = categoryId;
        behavior.type = type;
        behavior.timestamp = timestamp;
        behavior.counts = counts;
        return behavior;
    }


    @Override
    public String toString() {
        return "MyBehavior{" + "userId='" + userId + '\'' + ", itemId='" + itemId + '\''
                + ", categoryId='" + categoryId + '\'' + ", type='" + type + '\''
                + ", timestamp=" + timestamp + "," + new Timestamp(timestamp)
                + "counts=" + counts + '}';
    }


    public String getUserId() {
        return userId;
    }
    public String getItemId() {
        return itemId;
    }
    public String getCategoryId() {
        return categoryId;
    }
    public String getType() {
        return type;
    }
    public long getTimestamp() {
        return timestamp;
    }
    public long getCounts() {
        return counts;
    }
}

6. 辅助类MyWindowAggFunction

package com.demo.realstatis;

import org.apache.flink.api.common.functions.AggregateFunction;

//
/**
 * Incremental aggregate that sums the {@code counts} field of the {@link MyBehavior}
 * records of a window. Both the accumulator and the result are a {@code Long} total.
 */
public class MyWindowAggFunction implements AggregateFunction<MyBehavior, Long, Long> {


    /**
     * Starts a new total.
     *
     * @return the initial accumulator, zero
     */
    @Override
    public Long createAccumulator() {
        return 0L;
    }


    /**
     * Folds one record into the running total; called once per incoming record.
     *
     * @param input       the record to add
     * @param accumulator the total so far
     * @return the total including {@code input.counts}
     */
    @Override
    public Long add(MyBehavior input, Long accumulator) {
        return accumulator + input.counts;
    }


    /**
     * Produces the final value of the aggregation.
     *
     * @param accumulator the accumulated total
     * @return the accumulated total, unchanged
     */
    @Override
    public Long getResult(Long accumulator) {
        return accumulator;
    }


    /**
     * Combines two partial totals. This is needed when windows are merged (session
     * windows); the tumbling and sliding windows used in this article do not call it.
     * It previously returned {@code null}, which would have discarded both totals and
     * failed on the next unboxing.
     *
     * @param a first partial total
     * @param b second partial total
     * @return the sum of both partial totals
     */
    @Override
    public Long merge(Long a, Long b) {
        return a + b;
    }
}

7. 辅助类MyWindowFunction

package com.demo.realstatis;

import org.apache.flink.api.java.tuple.Tuple;
import org.apache.flink.streaming.api.functions.windowing.WindowFunction;
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
import org.apache.flink.util.Collector;

/**
 * Window function that wraps an already aggregated count into an {@link ItemViewCount},
 * adding the key fields and the boundaries of the window that fired.
 */
public class MyWindowFunction implements WindowFunction<Long, ItemViewCount, Tuple, TimeWindow> {

    /**
     * Emits exactly one {@link ItemViewCount} for the fired window.
     *
     * @param tuple  key of the window; positions 0 and 1 are read as item ID and behaviour type
     * @param window the window that fired, used for its start and end
     * @param input  the aggregated values; only the first element is read
     * @param out    collector receiving the result
     * @throws Exception declared by the interface
     */
    @Override
    public void apply(Tuple tuple, TimeWindow window, Iterable<Long> input, Collector<ItemViewCount> out) throws Exception {
        // Grouping fields, in the order they were passed to keyBy.
        final String item = tuple.getField(0);
        final String behaviorType = tuple.getField(1);

        // Arguments are evaluated left to right: window start, window end, then the aggregate.
        out.collect(ItemViewCount.of(
                item,
                behaviorType,
                window.getStart(),
                window.getEnd(),
                input.iterator().next()));
    }
}

8. 测试数据

{"userId": "u001", "itemId": "p1001", "categoryId": "c11", "type": "pv", "timestamp": "2020-03-08 11:11:11"}
{"userId": "u002", "itemId": "p1001", "categoryId": "c11", "type": "pv", "timestamp": "2020-03-08 11:11:11"}
{"userId": "u003", "itemId": "p1001", "categoryId": "c11", "type": "pv", "timestamp": "2020-03-08 11:11:11"}
{"userId": "u003", "itemId": "p1001", "categoryId": "c11", "type": "cart", "timestamp": "2020-03-08 11:11:11"}
{"userId": "u011", "itemId": "p2222", "categoryId": "c22", "type": "pv", "timestamp": "2020-03-08 11:11:11"}
{"userId": "u012", "itemId": "p2222", "categoryId": "c22", "type": "pv", "timestamp": "2020-03-08 11:11:11"}
{"userId": "u012", "itemId": "p2222", "categoryId": "c22", "type": "pv", "timestamp": "2020-03-08 11:12:01"}
{"userId": "u001", "itemId": "p1001", "categoryId": "c11", "type": "pv", "timestamp": "2020-03-08 11:12:01"}
{"userId": "u002", "itemId": "p1001", "categoryId": "c11", "type": "pv", "timestamp": "2020-03-08 11:12:01"}
{"userId": "u003", "itemId": "p1001", "categoryId": "c11", "type": "pv", "timestamp": "2020-03-08 11:12:01"}
{"userId": "u003", "itemId": "p1001", "categoryId": "c11", "type": "cart", "timestamp": "2020-03-08 11:12:01"}
{"userId": "u011", "itemId": "p2222", "categoryId": "c22", "type": "pv", "timestamp": "2020-03-08 11:12:01"}
{"userId": "u012", "itemId": "p2222", "categoryId": "c22", "type": "pv", "timestamp": "2020-03-08 11:12:01"}
{"userId": "u011", "itemId": "p2222", "categoryId": "c22", "type": "pv", "timestamp": "2020-03-08 11:13:01"}

9. 测试输出

{itemId='p1001'type='pv', windowStart=1583636520000 , 2020-03-08 11:02:00.0, windowEnd=1583637120000 , 2020-03-08 11:12:00.0, viewCount=3}
{itemId='p1001'type='cart', windowStart=1583636520000 , 2020-03-08 11:02:00.0, windowEnd=1583637120000 , 2020-03-08 11:12:00.0, viewCount=1}
{itemId='p2222'type='pv', windowStart=1583636520000 , 2020-03-08 11:02:00.0, windowEnd=1583637120000 , 2020-03-08 11:12:00.0, viewCount=2}
{itemId='p1001'type='cart', windowStart=1583636580000 , 2020-03-08 11:03:00.0, windowEnd=1583637180000 , 2020-03-08 11:13:00.0, viewCount=2}
{itemId='p1001'type='pv', windowStart=1583636580000 , 2020-03-08 11:03:00.0, windowEnd=1583637180000 , 2020-03-08 11:13:00.0, viewCount=6}
{itemId='p2222'type='pv', windowStart=1583636580000 , 2020-03-08 11:03:00.0, windowEnd=1583637180000 , 2020-03-08 11:13:00.0, viewCount=5}

  • 0
    点赞
  • 7
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值