Flink使用KeyedProcessFunction求topN

需求

计算最近10秒内访问次数最多的2个url,每5秒更新一次。

自定义数据源


import org.apache.flink.streaming.api.functions.source.SourceFunction;

import java.util.Calendar;
import java.util.Random;

/**
 * Author:panghu
 * Date:2022-04-19
 * Description: Custom data source that emits one randomly generated click event per second.
 * A source built on the plain SourceFunction interface can only run with parallelism 1.
 */
public class ClickSource implements SourceFunction<Event> {
    // cancel() is invoked from a different thread than the one looping in run(),
    // so the flag must be volatile for the loop to reliably observe the stop request.
    private volatile boolean flag = true;

    /**
     * Emits events downstream until {@link #cancel()} is called.
     *
     * @param sourceContext context used to send events downstream
     * @throws Exception if the emitting thread is interrupted while sleeping
     */
    @Override
    public void run(SourceContext<Event> sourceContext) throws Exception {
        Random random = new Random();
        String[] users = {"Mary", "Alice", "Bob", "Cary"};
        String[] urls = {
                "./home", "./cart", "./fav", "./prod?id=1", "./prod?id=2"
        };
        while (flag) {
            // Pick a random user and url, stamped with the current wall-clock time
            sourceContext.collect(
                    new Event(
                            users[random.nextInt(users.length)],
                            urls[random.nextInt(urls.length)],
                            System.currentTimeMillis()
                    )
            );
            // Generate one record per second
            Thread.sleep(1000L);
        }
    }

    /**
     * Stops the source by clearing the flag that keeps the emit loop running.
     */
    @Override
    public void cancel() {
        flag = false;
    }
}

准备POJO类

/**
 * Author:panghu
 * Date:2022-07-23
 * Description:
 */
import java.sql.Timestamp;

/**
 * POJO holding the number of visits to one url within one window.
 *
 * <p>The fields stay public and the no-arg constructor stays in place because the
 * TopN job registers this class through {@code Types.POJO(UrlViewCount.class)}.
 */
public class UrlViewCount {
    public String url;
    public Long count;
    // Window boundaries as epoch milliseconds
    public Long windowStart;
    public Long windowEnd;

    /** Creates an empty instance; all fields are {@code null} until assigned. */
    public UrlViewCount() {
    }

    /**
     * @param url         the visited url
     * @param count       number of visits to {@code url} inside the window
     * @param windowStart window start, epoch milliseconds
     * @param windowEnd   window end, epoch milliseconds
     */
    public UrlViewCount(String url, Long count, Long windowStart, Long windowEnd) {
        this.url = url;
        this.count = count;
        this.windowStart = windowStart;
        this.windowEnd = windowEnd;
    }

    /**
     * Renders the window boundaries as {@link Timestamp}s. Unset boundaries are
     * printed as {@code null} instead of failing on auto-unboxing.
     */
    @Override
    public String toString() {
        return "UrlViewCount{" +
                "url='" + url + '\'' +
                ", count=" + count +
                ", windowStart=" + formatMillis(windowStart) +
                ", windowEnd=" + formatMillis(windowEnd) +
                '}';
    }

    // Converts epoch milliseconds to a printable timestamp, tolerating null.
    private static String formatMillis(Long epochMillis) {
        return epochMillis == null ? "null" : new Timestamp(epochMillis).toString();
    }
}

TopN实现

import com.wanshun.bigdata.chapter05.ClickSource;
import com.wanshun.bigdata.chapter05.Event;
import com.wanshun.bigdata.java.UrlViewCount;
import org.apache.flink.api.common.eventtime.SerializableTimestampAssigner;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.common.functions.AggregateFunction;
import org.apache.flink.api.common.state.ListState;
import org.apache.flink.api.common.state.ListStateDescriptor;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.KeyedProcessFunction;
import org.apache.flink.streaming.api.functions.windowing.ProcessWindowFunction;
import org.apache.flink.streaming.api.windowing.assigners.SlidingEventTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
import org.apache.flink.util.Collector;

import java.sql.Timestamp;
import java.time.Duration;
import java.util.ArrayList;
import java.util.Comparator;

/**
 * Author:panghu
 * Date:2022-07-23
 * Description: 在KeyedByProcessFunction中,注册定时器统计窗口topN的数据,
 * 只有在keyedProcessFunction中才可以注册定时器
 */
public class _04KeyedProcessTopN {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);
        DataStreamSource<Event> streamSource = env.addSource(new ClickSource());
        // 提取时间戳,生成水位线
        SingleOutputStreamOperator<Event> streamOperator = streamSource.assignTimestampsAndWatermarks(
                WatermarkStrategy.<Event>forBoundedOutOfOrderness(Duration.ZERO)
                        .withTimestampAssigner(new SerializableTimestampAssigner<Event>() {
                            @Override
                            public long extractTimestamp(Event event, long l) {
                                return event.timeStamp;
                            }
                        })
        );
        streamOperator.print();
        // 统计每个窗口内访问次数最多的两个url
        streamOperator.keyBy(data -> data.url)
                // 窗口大小为10,每5秒输出一次
                .window(SlidingEventTimeWindows.of(Time.seconds(10), Time.seconds(5)))
                .aggregate(new UrlAggFunction(), new UrlProcessFunction())
                // 统计每个窗口内的url访问次数,按照窗口结束时间keyBy
                .keyBy(data -> data.windowEnd)
                .process(new TopNKeyedProcessFuntion(2))
                .print();

        env.execute();
    }

    private static class UrlAggFunction implements AggregateFunction<Event, Long, Long> {
        @Override
        public Long createAccumulator() {
            return 0L;
        }

        @Override
        public Long add(Event event, Long acc) {
            return acc + 1;
        }

        @Override
        public Long getResult(Long acc) {
            return acc;
        }

        @Override
        public Long merge(Long acc, Long acc1) {
            return null;
        }
    }

    private static class UrlProcessFunction extends ProcessWindowFunction<Long, UrlViewCount, String, TimeWindow> {
        @Override
        public void process(String key, Context context, Iterable<Long> elements, Collector<UrlViewCount> out) throws Exception {
            // 获取了累加器中每个链接的访问次数
            Long cnt = elements.iterator().next();
            // 封装成POJO类并输出
            UrlViewCount urlViewCount = new UrlViewCount(key, cnt, context.window().getStart(), context.window().getEnd());
            out.collect(urlViewCount);
        }
    }

    private static class TopNKeyedProcessFuntion extends KeyedProcessFunction<Long, UrlViewCount, String> {
        private Integer n;
        private ListState<UrlViewCount> listState;

        // topN
        public TopNKeyedProcessFuntion(Integer n) {
            this.n = n;
        }

        // open方法是KeyedProcessFunction父类中的方法,可以获取运行时状态信息
        @Override
        public void open(Configuration parameters) throws Exception {
            // 获取列表状态信息
            listState = getRuntimeContext().getListState(
                    new ListStateDescriptor<UrlViewCount>("url-count", Types.POJO(UrlViewCount.class))
            );
        }

        // 上游每来一次数据,处理一次
        @Override
        public void processElement(UrlViewCount value, Context ctx, Collector<String> out) throws Exception {
            // 数据添加到状态列表中
            listState.add(value);
            // 注册定时器,水位线代表当前时间之前的所有数据都已经到齐了,只要多等待1ms,就可以保证窗口的数据全部到齐了
            ctx.timerService().registerEventTimeTimer(ctx.getCurrentKey() + 1);
        }

        @Override
        public void onTimer(long timestamp, OnTimerContext ctx, Collector<String> out) throws Exception {
            // ArrayList方便排序
            ArrayList<UrlViewCount> urlViewCountArrayList = new ArrayList<>();
            for (UrlViewCount urlViewCount : listState.get()) {
                urlViewCountArrayList.add(urlViewCount);
            }
            // 清空状态列表,释放资源
            listState.clear();
            // 排序
            urlViewCountArrayList.sort(new Comparator<UrlViewCount>() {
                @Override
                public int compare(UrlViewCount o1, UrlViewCount o2) {
                    return o2.count.intValue() - o1.count.intValue();
                }
            });
            StringBuilder sb = new StringBuilder();
            sb.append("=============================================\n");
            sb.append("当前窗口结束时间为:" + new Timestamp(timestamp - 1) + "\n");
            sb.append("当前水位线为:" + new Timestamp(ctx.timerService().currentWatermark()) + "\n");
            // 取TopN
            for (int i = 0; i < n; i++) {
                sb.append("No." + (i + 1) + ": " + urlViewCountArrayList.get(i).url
                        + " count: " + urlViewCountArrayList.get(i).count
                        + "\n"
                );
            }
            sb.append("=============================================\n");
            out.collect(sb.toString());
        }

    }
}
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值