Multiple ways to solve TopN in Flink

Method 1: open the window before keyBy (using ProcessAllWindowFunction)


import com.sandra.bean.Event;
import com.sandra.day03.Flink01_Source_Customer;
import org.apache.flink.api.common.eventtime.SerializableTimestampAssigner;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.*;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.windowing.ProcessAllWindowFunction;
import org.apache.flink.streaming.api.windowing.assigners.SlidingEventTimeWindows;
import org.apache.flink.streaming.api.windowing.assigners.SlidingProcessingTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
import org.apache.flink.util.Collector;

import java.time.Duration;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;


/**
 * Requirement: TopN
 *      Count the hottest urls of a recent time range in real time, e.g. the two hottest urls
 *      of the last 10s, updated every 5s.
 * Analysis: this maps naturally to a sliding window, and "hotness" can simply be measured by the
 * visit count, so we open a sliding window, collect the url visits, count them per url, and then
 * sort and emit the top two.
 *
 * Approach 1: open the window before keyBy, so all urls end up in the same window
 * Approach 2: keyBy a constant so all urls land in the same window, then open the window,
 *             count visits per url, sort, and emit the two urls with the highest visit counts
 */
public class Flink07_TopN {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);

        // Use the custom source to randomly generate Event records
        SingleOutputStreamOperator<Event> streamSource = env.addSource(new Flink01_Source_Customer.MySourceFunction())
                // Specify a watermark strategy and assign timestamps
                .assignTimestampsAndWatermarks(WatermarkStrategy
                        // Bounded out-of-orderness watermark (3s)
                        .<Event>forBoundedOutOfOrderness(Duration.ofSeconds(3))
                        // Assign event-time timestamps
                        .withTimestampAssigner(new SerializableTimestampAssigner<Event>() {
                            @Override
                            public long extractTimestamp(Event element, long recordTimestamp) {
                                return element.timestamp;//ms
                            }
                        })
                );

        // Extract the url field
        SingleOutputStreamOperator<String> urlStream = streamSource.map(new MapFunction<Event, String>() {
            @Override
            public String map(Event value) throws Exception {
                return value.url;
            }
        });

        // Count the two hottest urls of the last 10s, updated every 5s (10s -> window size, 5s -> slide)
        // todo Open the window before keyBy, so different urls can be compared inside the same window
        urlStream.windowAll(SlidingEventTimeWindows.of(Time.seconds(10), Time.seconds(5)))
                .process(new TopNProcessAllWindowFun(2))
                .print();

        env.execute();
    }

    public static class TopNProcessAllWindowFun extends ProcessAllWindowFunction<String, Tuple2<String,Integer>, TimeWindow>{

        private Integer topn;
        public TopNProcessAllWindowFun(Integer topN){
            this.topn=topN;
        }


        @Override
        public void process(Context context, Iterable<String> elements, Collector<Tuple2<String, Integer>> out) throws Exception {
            // One HashMap per window: url -> visit count
            HashMap<String, Integer> hashMap = new HashMap<>();
            Iterator<String> iterator = elements.iterator();
            while (iterator.hasNext()){
                String url = iterator.next();
                if (hashMap.containsKey(url)){
                    hashMap.put(url,hashMap.get(url)+1);
                }else {
                    // First time this url appears in the window
                    hashMap.put(url,1);
                }
            }

            // Sort the HashMap entries by visit count
            // Put them into a List so we can call List.sort()
            ArrayList<Tuple2<String, Integer>> url2CountList = new ArrayList<>();

            // Copy every (url, count) pair into the list for sorting
            for (String url : hashMap.keySet()) {
                url2CountList.add(Tuple2.of(url,hashMap.get(url)));
            }

            // Sort the list by visit count in descending order
            url2CountList.sort(new Comparator<Tuple2<String, Integer>>() {
                @Override
                public int compare(Tuple2<String, Integer> o1, Tuple2<String, Integer> o2) {
                    return Integer.compare(o2.f1, o1.f1);
                }
            });

            // Emit the urls with the highest visit counts
            // Guard with Math.min in case the window contains fewer than topn distinct urls
            for (int i = 0; i < Math.min(topn, url2CountList.size()); i++) {
                out.collect(url2CountList.get(i));
            }
        }
    }
}

 

 

The custom source:

package com.sandra.day03;

import com.sandra.bean.Event;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.source.ParallelSourceFunction;

import java.util.Random;

// Custom source
public class Flink01_Source_Customer {

    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);

        // Get data from the custom source
        DataStreamSource<Event> streamSource = env.addSource(new MySourceFunction()).setParallelism(2);

        streamSource.print();

        env.execute();

    }


    // A custom class implementing SourceFunction can generate Event records, but its parallelism can only be 1
//    public static class MySourceFunction implements SourceFunction<Event>{

    // To run with a parallelism greater than 1, implement ParallelSourceFunction instead
    public static class MySourceFunction implements ParallelSourceFunction<Event> {

//        private Random random = new Random(); // Declared here, every parallel instance would generate the same data in each time slice
        private Boolean isRunning = true;

        /**
         * Generates data (or reads it from an external system); typically contains a while loop.
         * @param ctx use ctx.collect() to send the generated/fetched records downstream
         * @throws Exception
         */
        @Override
        public void run(SourceContext<Event> ctx) throws Exception {
            Random random = new Random(); // Declared here, each parallel instance generates different data
            String[] users = {"sandra","wuli"};
            String[] urls = {"./home","./cart","./flink","./java"};

            while (isRunning){
                ctx.collect(new Event(users[random.nextInt(users.length)],urls[random.nextInt(urls.length)],System.currentTimeMillis()));
//                Thread.sleep(500);
            }
        }

        /**
         * run() usually contains a while loop; this method stops it. It is invoked by the framework, not called by us.
         */
        @Override
        public void cancel() {
            isRunning = false;
        }
    }

}
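The com.sandra.bean.Event class is not shown in the post. Below is a minimal sketch of what the examples assume: the field names come from the accesses value.url and element.timestamp and from the three-argument constructor call in the source above; everything else (field types, toString) is an assumption.

package com.sandra.bean;

// Minimal sketch of the Event bean assumed by the examples above.
// Public fields plus a public no-arg constructor let Flink treat it as a POJO.
public class Event {
    public String user;
    public String url;
    public Long timestamp; // event time in milliseconds

    public Event() {
    }

    public Event(String user, String url, Long timestamp) {
        this.user = user;
        this.url = url;
        this.timestamp = timestamp;
    }

    @Override
    public String toString() {
        return "Event{user='" + user + "', url='" + url + "', timestamp=" + timestamp + "}";
    }
}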

Method 2: keyBy a constant, then open the window (using ProcessWindowFunction)


import com.sandra.bean.Event;
import com.sandra.day03.Flink01_Source_Customer;
import org.apache.flink.api.common.eventtime.SerializableTimestampAssigner;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.KeyedStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.datastream.WindowedStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.windowing.ProcessAllWindowFunction;
import org.apache.flink.streaming.api.functions.windowing.ProcessWindowFunction;
import org.apache.flink.streaming.api.windowing.assigners.SlidingEventTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
import org.apache.flink.util.Collector;

import java.time.Duration;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;


/**
 * Requirement: TopN
 *      Count the two hottest urls of the last 10s, updated every 5s.
 *
 * Approach 2: keyBy a constant so all urls land in the same window, then open the window,
 * count visits per url, sort, and emit the two urls with the highest visit counts.
 */
public class Flink07_TopN2 {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);

        // Use the custom source to randomly generate Event records
        SingleOutputStreamOperator<Event> streamSource = env.addSource(new Flink01_Source_Customer.MySourceFunction())
                // Specify a watermark strategy and assign timestamps
                .assignTimestampsAndWatermarks(WatermarkStrategy
                        // Bounded out-of-orderness watermark (3s)
                        .<Event>forBoundedOutOfOrderness(Duration.ofSeconds(3))
                        // Assign event-time timestamps
                        .withTimestampAssigner(new SerializableTimestampAssigner<Event>() {
                            @Override
                            public long extractTimestamp(Event element, long recordTimestamp) {
                                return element.timestamp;//ms
                            }
                        })
                );

        // Extract the url field
        SingleOutputStreamOperator<String> urlStream = streamSource.map(new MapFunction<Event, String>() {
            @Override
            public String map(Event value) throws Exception {
                return value.url;
            }
        });

        // todo keyBy a constant key, then open the window and compute
        KeyedStream<String, String> keyedStream = urlStream.keyBy(new KeySelector<String, String>() {
            @Override
            public String getKey(String value) throws Exception {
                return "1";
            }
        });


        WindowedStream<String, String, TimeWindow> windowedStream
                = keyedStream.window(SlidingEventTimeWindows.of(Time.seconds(10), Time.seconds(5)));

        windowedStream.process(new ProcessWindowFunction<String, Tuple2<String,Integer>, String, TimeWindow>() {
            @Override
            public void process(String s, Context context, Iterable<String> elements, Collector<Tuple2<String, Integer>> out) throws Exception {
                HashMap<String, Integer> hashMap = new HashMap<>(); // (url, visit count)
                ArrayList<Tuple2<String, Integer>> list = new ArrayList<>();

                // Count visits per url
                for (String url : elements) {
                    if (hashMap.containsKey(url)){
                        hashMap.put(url,hashMap.get(url)+1);
                    }else {
                        hashMap.put(url,1);
                    }
                }

                // Collect the (url, visit count) pairs so the two highest can be selected
                for (String url : hashMap.keySet()) {
                    list.add(Tuple2.of(url,hashMap.get(url)));
                }

                // Sort the (url, visit count) pairs by visit count in descending order
                list.sort(new Comparator<Tuple2<String, Integer>>() {
                    @Override
                    public int compare(Tuple2<String, Integer> o1, Tuple2<String, Integer> o2) {
                        return Integer.compare(o2.f1, o1.f1);
                    }
                });

                // Emit the top 2 (url, visit count); guard against windows with fewer than 2 distinct urls
                for (int i = 0; i < Math.min(2, list.size()); i++) {
                    out.collect(list.get(i));
                }
            }
        }).print();


        env.execute();
    }
}

Method 3 (best solution): keyBy the url, open a window to count each url's visits. (This code still has a small flaw: when selecting the top 2, if only one url, or none, was visited within the 10s window, list.get(i) throws an IndexOutOfBoundsException; fixing this is left to the reader.)

import com.sandra.bean.Event;
import com.sandra.day03.Flink01_Source_Customer;
import org.apache.flink.api.common.eventtime.SerializableTimestampAssigner;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.common.functions.AggregateFunction;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.common.state.ListState;
import org.apache.flink.api.common.state.ListStateDescriptor;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.tuple.Tuple4;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.KeyedStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.KeyedProcessFunction;
import org.apache.flink.streaming.api.functions.windowing.ProcessWindowFunction;
import org.apache.flink.streaming.api.windowing.assigners.SlidingEventTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
import org.apache.flink.util.Collector;

import java.time.Duration;
import java.util.ArrayList;
import java.util.Comparator;

public class Flink07_TopN4 {
    public static void main(String[] args) {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        env.setParallelism(3);

        DataStreamSource<Event> streamSource = env.addSource(new Flink01_Source_Customer.MySourceFunction());

        // Group records with the same url together
        KeyedStream<String, String> urlKeyedStream = streamSource.assignTimestampsAndWatermarks(WatermarkStrategy
                .<Event>forBoundedOutOfOrderness(Duration.ofSeconds(0))
                .withTimestampAssigner(new SerializableTimestampAssigner<Event>() {
                    @Override
                    public long extractTimestamp(Event element, long recordTimestamp) {
                        return element.timestamp;
                    }
                })).map(new MapFunction<Event, String>() {
            @Override
            public String map(Event value) throws Exception {
                return value.url;
            }
        }).keyBy(new KeySelector<String, String>() {
            @Override
            public String getKey(String url) throws Exception {
                return url;
            }
        });

        // Open a window per url and count its visits
        SingleOutputStreamOperator<Tuple4<String, Integer, Long, Long>> urlPvStream = urlKeyedStream.window(SlidingEventTimeWindows.of(Time.seconds(10), Time.seconds(5)))
                // Pre-aggregate incrementally to reduce state pressure, then use the full-window function to wrap the result as Tuple4<url, visit count, window start, window end>
                .aggregate(new AggregateFunction<String, Tuple2<String, Integer>, Tuple2<String, Integer>>() {
                    @Override
                    public Tuple2<String, Integer> createAccumulator() {
                        return Tuple2.of("", 0);
                    }

                    @Override
                    public Tuple2<String, Integer> add(String url, Tuple2<String, Integer> accumulator) {
                        return Tuple2.of(url, accumulator.f1 + 1);
                    }

                    @Override
                    public Tuple2<String, Integer> getResult(Tuple2<String, Integer> accumulator) {
                        return accumulator;
                    }

                    @Override
                    public Tuple2<String, Integer> merge(Tuple2<String, Integer> a, Tuple2<String, Integer> b) {
                        // merge() is only needed for session windows; merge the counts anyway instead of returning null
                        return Tuple2.of(a.f0, a.f1 + b.f1);
                    }
                }, new ProcessWindowFunction<Tuple2<String, Integer>, Tuple4<String, Integer, Long, Long>, String, TimeWindow>() {
                    @Override
                    public void process(String s, Context context, Iterable<Tuple2<String, Integer>> elements, Collector<Tuple4<String, Integer, Long, Long>> out) throws Exception {
                        Tuple2<String, Integer> next = elements.iterator().next(); // the iterator holds exactly one element: the pre-aggregated result
                        out.collect(Tuple4.of(next.f0, next.f1, context.window().getStart(), context.window().getEnd()));
                    }
                });

        // Group the per-url results of the same window (visit count, window start/end) together
        KeyedStream<Tuple4<String, Integer, Long, Long>, Long> windowStream = urlPvStream.keyBy(new KeySelector<Tuple4<String, Integer, Long, Long>, Long>() {
            @Override
            public Long getKey(Tuple4<String, Integer, Long, Long> value) throws Exception {
                return value.f3; // keying by window end time (window start would work equally well)
            }
        });

        // Sort the urls of the same window by visit count in descending order and emit the top two
        windowStream.process(new KeyedProcessFunction<Long, Tuple4<String, Integer, Long, Long>, String>() {

            private ListState<Tuple4> listState;

            @Override
            public void open(Configuration parameters) throws Exception {
                listState = getRuntimeContext().getListState(new ListStateDescriptor<Tuple4>("listState", Types.TUPLE(Types.STRING, Types.INT, Types.LONG, Types.LONG)));
            }

            @Override
            public void processElement(Tuple4<String, Integer, Long, Long> value, Context ctx, Collector<String> out) throws Exception {
                // Collect each incoming result; when the timer fires, all results of this window are sorted by visit count
                listState.add(value);
                // Register a timer so that by the time it fires, every result of this window has been collected
                ctx.timerService().registerEventTimeTimer(value.f3 + 1); // window end time + 1ms: the 1ms leaves time for the results of all upstream subtasks to reach this operator

            }

            // When the timer fires, sort this window's results and compute the TopN
            @Override
            public void onTimer(long timestamp, OnTimerContext ctx, Collector<String> out) throws Exception {
                // ListState has no sort method, so copy its contents into a List and call sort() there
                ArrayList<Tuple4> list = new ArrayList<>();
                Iterable<Tuple4> tuple4s = listState.get();
                for (Tuple4 tuple4 : tuple4s) {
                    list.add(tuple4);
                }

                // Clear the state to release resources
                listState.clear();

                // Sort by visit count in descending order
                list.sort(new Comparator<Tuple4>() {
                    @Override
                    public int compare(Tuple4 o1, Tuple4 o2) {
                        return Integer.compare((int) o2.f1, (int) o1.f1);
                    }
                });

                // Top-2 output, variant 1 (same flaw as noted above: if only one url, or none, was
                // visited in the 10s window, list.get(i) throws an exception)
//                for (int i = 0; i < 2; i++) {
//                    out.collect("url ranked No." + (i + 1) + " by visits: " + list.get(i).f0 + ", visits: " + list.get(i).f1 + ", window: [" + list.get(i).f2 + "," + list.get(i).f3 + ")");
//                }

                // Top-2 output, variant 2 (same flaw: if only one url, or none, was visited in the
                // 10s window, list.get(i) throws an exception)
                StringBuilder stringBuilder = new StringBuilder();
                stringBuilder.append("======================\n");

                Long windowStart = 0L;
                Long windowEnd = 0L;

                for (int i = 0; i < 2; i++) {
                    Tuple4 tuple4 = list.get(i);
                    windowStart = (Long) tuple4.f2;
                    windowEnd = (Long) tuple4.f3;

                    stringBuilder.append(" url ranked No." + (i + 1) + " by visits: " + tuple4.f0)
                            .append(", visits: " + tuple4.f1);
                }

                stringBuilder.append(" window: [" + windowStart + "," + windowEnd + ")")
                        .append("\n");

                out.collect(stringBuilder.toString());


            }
        }).print();

        try {
            env.execute();
        } catch (Exception e) {
            e.printStackTrace();
        }


    }
}
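One way to remove the flaw noted above is to bound the loop by the size of the sorted list instead of a hard-coded 2. A minimal sketch, written as a hypothetical standalone helper (TopNFormatter/formatTopN are not part of the original code) rather than a drop-in replacement for the onTimer body:

import java.util.List;

import org.apache.flink.api.java.tuple.Tuple4;

// Hypothetical helper: formats up to n entries of a list already sorted by visit count (descending).
public class TopNFormatter {

    public static String formatTopN(List<Tuple4<String, Integer, Long, Long>> sorted, int n) {
        if (sorted.isEmpty()) {
            return "no url was visited in this window";
        }
        StringBuilder sb = new StringBuilder("======================\n");
        // Guard: never read past the end of the list, even if fewer than n urls were visited
        int limit = Math.min(n, sorted.size());
        for (int i = 0; i < limit; i++) {
            Tuple4<String, Integer, Long, Long> t = sorted.get(i);
            sb.append(" url ranked No.").append(i + 1).append(" by visits: ").append(t.f0)
              .append(", visits: ").append(t.f1);
        }
        // All entries carry the same window bounds, so take them from the first entry
        sb.append(" window: [").append(sorted.get(0).f2).append(",").append(sorted.get(0).f3).append(")\n");
        return sb.toString();
    }
}

Inside onTimer this would replace the hand-written loop with out.collect(TopNFormatter.formatTopN(list, 2)), after declaring list with the parameterized element type Tuple4<String, Integer, Long, Long>.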

Method 4: build the window yourself (compute the window start time and the timer firing time manually). The requirement is adjusted here: compute each url's page views (pv) over 5s tumbling windows, which is the per-url counting step on top of which the top-2 ranking from method 3 can be applied.

import com.sandra.bean.Event;
import org.apache.flink.api.common.eventtime.SerializableTimestampAssigner;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.common.state.MapState;
import org.apache.flink.api.common.state.MapStateDescriptor;
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.KeyedProcessFunction;
import org.apache.flink.util.Collector;

import java.time.Duration;

/**
 * Requirement: simulate a tumbling window and compute each url's pv per window. We implemented this
 * requirement before in day05 (Flink05_TimeWindow_TumblingWindow_Aggre_Process) by combining an
 * incremental aggregate with a full-window function; here it is implemented with MapState instead.
 */
public class Flink11_KeyedState_MapState {
    public static void main(String[] args) {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);

        SingleOutputStreamOperator<Event> streamSource = env.addSource(new Flink02_Source_Customer_WithWaterMark1.MySourceFunction())
                .assignTimestampsAndWatermarks(WatermarkStrategy
                        .<Event>forBoundedOutOfOrderness(Duration.ofSeconds(0))
                        .withTimestampAssigner(new SerializableTimestampAssigner<Event>() {
                            @Override
                            public long extractTimestamp(Event element, long recordTimestamp) {
                                return element.timestamp;
                            }
                        })
                );

        streamSource.keyBy(new KeySelector<Event, String>() {
            @Override
            public String getKey(Event value) throws Exception {
                return value.url;
            }
        })
        /**
         * Build a window by hand and count each url's visits within 5s:
         * 1. the window size is 5s
         * 2. records with the same url must be counted together
         *  2.1 on top of condition 2, records of the same url that fall into the same window must be counted together
         *
         *  Key question: which records belong to the same window? When a record arrives, its window start
         *  time can be computed from its own timestamp, using the same window-start formula as in Flink's
         *  window source code. Records that belong to the same window then always yield the same start time.
         *  Since we count per url within a window, each arriving record has to be combined with the count
         *  computed so far for that window.
         *
         *  So the running result of every window must be kept, and to know which window a result belongs to
         *  it is stored as (k, v) -> (window start time, visit count of this url in this window). MapState fits
         *  this exactly, and because keyed state is scoped per key (the key is the url), this gives us the
         *  pv computation per url per window.
         */

                .process(new MyFakeWindow(5000)).print(); // 5s tumbling window

        try {
            env.execute();
        } catch (Exception e) {
            e.printStackTrace();
        }


    }

    private static class MyFakeWindow extends KeyedProcessFunction<String,Event,String> {
        // Declare a MapState: (k, v) -> (window start time, visit count of this url in this window)
        private MapState<Long,Integer> mapState;
        private Integer windowSize;
        public MyFakeWindow(Integer windowSize){
            this.windowSize=windowSize;
        }

        @Override
        public void open(Configuration parameters) throws Exception {
            mapState = getRuntimeContext().getMapState(new MapStateDescriptor<Long, Integer>("map-state",Long.class,Integer.class));

        }




        @Override
        public void processElement(Event value, Context ctx, Collector<String> out) throws Exception {
            // For each record, determine which window it belongs to by computing that window's start and end time
//            // With offset 0 this shorter form works too, because (t + windowSize) % windowSize == t % windowSize
//            Long windowStart = value.timestamp - value.timestamp % windowSize;
            Long windowStart = value.timestamp - (value.timestamp - 0 + windowSize) % windowSize;
            Long windowEnd = windowStart + windowSize;

            // If the state already has this window, increment this url's count for it (mimicking an incremental aggregate: process each record as it arrives)
            if (mapState.contains(windowStart)){
                mapState.put(windowStart,mapState.get(windowStart) + 1);
            }else{
                mapState.put(windowStart,1);
            }

            // Register a timer that triggers the window computation and then closes the window
            ctx.timerService().registerEventTimeTimer(windowEnd - 1); // e.g. for window [0, 5000) the timer fires at 4999 ms

        }

        @Override
        public void onTimer(long timestamp, OnTimerContext ctx, Collector<String> out) throws Exception {
            // Recover the window start time from the timer timestamp
            Long windowStart = timestamp + 1 - windowSize; // e.g. timestamp = 4999 ms
            Long windowEnd = windowStart + windowSize;
            out.collect("window: [" + windowStart + "," + windowEnd + ") ===> pv: " + mapState.get(windowStart));

            // Besides triggering the computation, firing the timer also means closing the window: drop its state
            mapState.remove(windowStart);

        }
    }
}
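The window-start formula used in MyFakeWindow is the one referred to above as "the formula from the window source code"; Flink exposes it as TimeWindow.getWindowStartWithOffset (assuming the Flink version used here still provides that static helper). A small worked example as a sketch, using a 5000 ms window and offset 0:

import org.apache.flink.streaming.api.windowing.windows.TimeWindow;

// Worked example of the manual window-start calculation from MyFakeWindow.
// For timestamp 7321 ms, window size 5000 ms, offset 0:
//   7321 - (7321 - 0 + 5000) % 5000 = 7321 - 2321 = 5000  -> window [5000, 10000)
public class WindowStartDemo {
    public static void main(String[] args) {
        long timestamp = 7321L;
        long windowSize = 5000L;
        long offset = 0L;

        long manualStart = timestamp - (timestamp - offset + windowSize) % windowSize;

        // Flink's own helper computes the same value
        long flinkStart = TimeWindow.getWindowStartWithOffset(timestamp, offset, windowSize);

        System.out.println(manualStart + " == " + flinkStart); // prints: 5000 == 5000
    }
}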
