Flink 案例2-基于log日志的浏览量统计

Flink 案例2-基于log日志的浏览量统计

需求:每隔 5 秒,输出最近 1 分钟内访问量最多的前 N 个 URL

1 pom

   <dependencies>
        <!-- All Flink modules are pinned to the same release so their APIs stay compatible. -->
        <!-- Core Flink Java API -->
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-java</artifactId>
            <version>1.13.1</version>
        </dependency>
        <!-- DataStream API (Scala 2.11 build) -->
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-streaming-java_2.11</artifactId>
            <version>1.13.1</version>
        </dependency>
        <!-- Client classes needed to run the job locally -->
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-clients_2.11</artifactId>
            <version>1.13.1</version>
        </dependency>
        <!-- Kafka connector; version aligned with the other Flink modules (was 1.13.0) -->
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-connector-kafka_2.11</artifactId>
            <version>1.13.1</version>
        </dependency>

    </dependencies>

2 log数据

208.115.111.72 - - 17/05/2015:11:05:29 +0000 GET /scripts/grok-py-test/
208.115.111.72 - - 17/05/2015:11:05:08 +0000 GET /?N=A&page=21
208.115.111.72 - - 17/05/2015:11:05:49 +0000 GET /blog/geekery/oniguruma-named-capture-example.html?commentlimit=0
208.115.111.72 - - 17/05/2015:11:05:01 +0000 GET /blog/geekery/ssh-key-invalid-hack.html?commentlimit=0
208.115.111.72 - - 17/05/2015:11:05:31 +0000 GET /blog/geekery/server-side-javascript.html
208.115.111.72 - - 17/05/2015:11:05:15 +0000 GET /blog/geekery/yahoo-hackday-08.html
105.235.130.196 - - 17/05/2015:11:05:01 +0000 GET /images/googledotcom.png
174.37.205.76 - - 17/05/2015:11:05:19 +0000 GET /blog
54.255.13.204 - - 17/05/2015:11:05:03 +0000 GET /articles/ssh-security/
105.235.130.196 - - 17/05/2015:11:05:45 +0000 GET /blog/tags/X11
54.255.13.204 - - 17/05/2015:11:05:55 +0000 GET /reset.css
54.255.13.204 - - 17/05/2015:11:05:32 +0000 GET /style2.css
54.255.13.204 - - 17/05/2015:11:05:10 +0000 GET /favicon.ico
105.235.130.196 - - 17/05/2015:11:05:20 +0000 GET /reset.css
54.255.13.204 - - 17/05/2015:11:05:46 +0000 GET /images/jordan-80.png
54.255.13.204 - - 17/05/2015:11:05:17 +0000 GET /images/web/2009/banner.png
105.235.130.196 - - 17/05/2015:11:05:47 +0000 GET /style2.css
105.235.130.196 - - 17/05/2015:11:05:37 +0000 GET /images/jordan-80.png
105.235.130.196 - - 17/05/2015:11:05:22 +0000 GET /images/web/2009/banner.png
134.76.249.10 - - 17/05/2015:11:05:01 +0000 GET /projects/xdotool/
134.76.249.10 - - 17/05/2015:11:05:09 +0000 GET /reset.css
134.76.249.10 - - 17/05/2015:11:05:57 +0000 GET /style2.css
134.76.249.10 - - 17/05/2015:11:05:23 +0000 GET /favicon.ico
134.76.249.10 - - 17/05/2015:11:05:40 +0000 GET /images/jordan-80.png
134.76.249.10 - - 17/05/2015:11:05:50 +0000 GET /images/web/2009/banner.png
134.76.249.10 - - 17/05/2015:11:05:47 +0000 GET /projects/xdotool
134.76.249.10 - - 17/05/2015:11:05:13 +0000 GET /projects/xdotool/
66.249.73.135 - - 17/05/2015:11:05:26 +0000 GET /?flav=atom
207.241.237.220 - - 17/05/2015:11:05:24 +0000 GET /blog/tags/C?page=2
68.184.202.186 - - 17/05/2015:11:05:28 +0000 GET /projects/xpathtool/
68.184.202.186 - - 17/05/2015:11:05:02 +0000 GET /reset.css
68.184.202.186 - - 17/05/2015:11:05:05 +0000 GET /images/jordan-80.png
68.184.202.186 - - 17/05/2015:11:05:02 +0000 GET /style2.css
68.184.202.186 - - 17/05/2015:11:05:37 +0000 GET /images/web/2009/banner.png
68.184.202.186 - - 17/05/2015:11:05:58 +0000 GET /favicon.ico
46.105.14.53 - - 17/05/2015:11:05:29 +0000 GET /blog/tags/puppet?flav=rss20
66.249.73.135 - - 17/05/2015:11:05:00 +0000 GET /?flav=rss20
24.233.162.179 - - 17/05/2015:11:05:31 +0000 GET /favicon.ico
123.125.71.117 - - 17/05/2015:11:05:16 +0000 GET /
220.181.108.153 - - 17/05/2015:11:05:09 +0000 GET /
65.19.138.34 - - 17/05/2015:11:05:40 +0000 GET /
66.249.73.135 - - 17/05/2015:11:05:32 +0000 GET /blog/geekery/rhapsody-on-linux.html
97.116.185.190 - - 17/05/2015:11:05:59 +0000 GET /articles/dynamic-dns-with-dhcp/
97.116.185.190 - - 17/05/2015:11:05:39 +0000 GET /reset.css
97.116.185.190 - - 17/05/2015:11:05:29 +0000 GET /style2.css
97.116.185.190 - - 17/05/2015:11:05:39 +0000 GET /images/jordan-80.png
97.116.185.190 - - 17/05/2015:11:05:02 +0000 GET /images/web/2009/banner.png
97.116.185.190 - - 17/05/2015:11:05:35 +0000 GET /favicon.ico
5.255.72.168 - - 17/05/2015:11:05:21 +0000 GET /
5.255.72.168 - - 17/05/2015:11:05:08 +0000 GET /blog/geekery/installing-windows-8-consumer-preview.html
46.105.14.53 - - 17/05/2015:11:05:33 +0000 GET /blog/tags/puppet?flav=rss20
5.102.173.71 - - 17/05/2015:11:05:13 +0000 GET /robots.txt
5.102.173.71 - - 17/05/2015:11:05:06 +0000 GET /projects/xdotool/
208.91.156.11 - - 17/05/2015:11:05:05 +0000 GET /files/logstash/logstash-1.3.2-monolithic.jar
66.249.73.185 - - 17/05/2015:11:05:58 +0000 GET /presentations/logstash-1/
74.125.176.81 - - 17/05/2015:11:05:28 +0000 GET /?flav=rss20
66.249.73.135 - - 17/05/2015:11:05:14 +0000 GET /blog/geekery/xdotool-2.20110530.html
187.45.193.158 - - 17/05/2015:11:05:54 +0000 GET /presentations/logstash-1/file/about-me/tequila-face.jpg
90.220.199.149 - - 17/05/2015:11:05:18 +0000 GET /blog/geekery/puppet-manage-homedirectory-contents.html
90.220.199.149 - - 17/05/2015:11:05:24 +0000 GET /reset.css
90.220.199.149 - - 17/05/2015:11:05:50 +0000 GET /style2.css
90.220.199.149 - - 17/05/2015:12:05:37 +0000 GET /images/jordan-80.png
90.220.199.149 - - 17/05/2015:12:05:21 +0000 GET /images/web/2009/banner.png
90.220.199.149 - - 17/05/2015:12:05:17 +0000 GET /favicon.ico
36.38.8.174 - - 17/05/2015:12:05:24 +0000 GET /blog/geekery/ssl-latency.html
36.38.8.174 - - 17/05/2015:12:05:36 +0000 GET /reset.css
36.38.8.174 - - 17/05/2015:12:05:14 +0000 GET /style2.css
36.38.8.174 - - 17/05/2015:12:05:44 +0000 GET /images/jordan-80.png
36.38.8.174 - - 17/05/2015:12:05:17 +0000 GET /images/web/2009/banner.png
36.38.8.174 - - 17/05/2015:12:05:39 +0000 GET /favicon.ico
71.207.12.53 - - 17/05/2015:12:05:17 +0000 GET /favicon.ico
220.241.45.142 - - 17/05/2015:12:05:07 +0000 GET /robots.txt
220.241.45.142 - - 17/05/2015:12:05:30 +0000 GET /projects/firefox-tabsearch/
209.85.238.199 - - 17/05/2015:12:05:21 +0000 GET /?flav=atom
46.105.14.53 - - 17/05/2015:12:05:53 +0000 GET /blog/tags/puppet?flav=rss20
66.249.73.135 - - 17/05/2015:12:05:28 +0000 GET /blog/tags/noise
146.1.1.2 - - 17/05/2015:12:05:51 +0000 GET /blog/geekery/ssl-latency.html
146.1.1.2 - - 17/05/2015:12:05:18 +0000 GET /style2.css
146.1.1.2 - - 17/05/2015:12:05:55 +0000 GET /reset.css
146.1.1.2 - - 17/05/2015:12:05:24 +0000 GET /favicon.ico

3 code

 public static void main(String[] args) throws Exception {
        // Job entry point: reads the access log, counts hits per URL in sliding
        // event-time windows and prints the most visited URLs of every window.
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        // Window on event time, i.e. the timestamp parsed from each log line.
        env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);

        // The sample log is NOT sorted by timestamp, so an ascending-timestamp extractor
        // would push the watermark ahead of records that are still to come and earlier
        // windows would fire without them. Hold the watermark back by 60 seconds instead.
        // Fully qualified because the extractor is not among the names this code already uses.
        DataStream<ApacheLogDto> dataStream = env.readTextFile("D:\\linxq\\mime\\code\\flink\\src\\main\\resources\\LogTest.csv")
                .flatMap(new MyFlatMapFunction())
                .assignTimestampsAndWatermarks(
                        new org.apache.flink.streaming.api.functions.timestamps.BoundedOutOfOrdernessTimestampExtractor<ApacheLogDto>(Time.seconds(60)) {
                            @Override
                            public long extractTimestamp(ApacheLogDto apacheLogDto) {
                                return apacheLogDto.getTimestamp();
                            }
                        });

        // Group by URL, then count per sliding window.
        SingleOutputStreamOperator<BehaviorUriOutWindow> aggStream = dataStream.keyBy(ApacheLogDto::getUrl) // key by URL
                .timeWindow(Time.seconds(60), Time.seconds(5)) // 60 s window sliding every 5 s
                .aggregate(new CountAgg(), new AggResultWinFunction()); // incremental count + window metadata


        // Re-key by window end time and emit the top 2 URLs of each window.
        aggStream.keyBy(BehaviorUriOutWindow::getEndTime).process(new MyUrlKeyProcessFunction(2))
                .print("hot url:").setParallelism(1); // print the result

        env.execute();
    }


    /**
     * Parses one access-log line into an {@link ApacheLogDto} and drops requests
     * for static resources.
     *
     * <p>Expected input, fields separated by single spaces:
     * {@code 79.185.184.23 - - 17/05/2015:12:05:31 +0000 GET /reset.css}
     *
     * <p>Lines with fewer than seven fields (for example blank lines) are skipped
     * instead of failing the job. A line whose timestamp field cannot be parsed
     * still raises a {@link java.text.ParseException}.
     */
    public static class MyFlatMapFunction implements FlatMapFunction<String, ApacheLogDto> {

        /** Number of space-separated fields a line must have: ip, -, -, time, offset, method, url. */
        private static final int MIN_FIELDS = 7;

        /** Matches URLs that do NOT end in one of the listed static-resource extensions; compiled once. */
        private static final Pattern NON_STATIC_URL =
                Pattern.compile("^((?!\\.(css|js|png|ico|html|txt|jar|jpg)$).)*$");

        // NOTE(review): the formatter uses the JVM default time zone and the "+0000"
        // offset field (index 4) is never read — confirm this is the intended interpretation.
        private SimpleDateFormat simpleDateFormat = new SimpleDateFormat("dd/MM/yyyy:HH:mm:ss");

        /**
         * @param s         raw log line
         * @param collector receives the parsed record when its URL is not a static resource
         * @throws Exception if the timestamp field does not match {@code dd/MM/yyyy:HH:mm:ss}
         */
        @Override
        public void flatMap(String s, Collector<ApacheLogDto> collector) throws Exception {
            String[] split = s.split(" ");
            if (split.length < MIN_FIELDS) {
                // Blank or truncated line: nothing to parse.
                return;
            }

            // Strip the query string; indexOf avoids the empty-array case of split("\\?") on "?".
            String rawUrl = split[6];
            int queryStart = rawUrl.indexOf('?');
            String url = queryStart < 0 ? rawUrl : rawUrl.substring(0, queryStart);

            // Drop URLs ending in css, js, png, ico, html, txt, jar or jpg.
            if (!NON_STATIC_URL.matcher(url).matches()) {
                return;
            }

            ApacheLogDto apacheLogDto = new ApacheLogDto();
            apacheLogDto.setIpAddr(split[0]);
            apacheLogDto.setTimestamp(simpleDateFormat.parse(split[3]).getTime());
            apacheLogDto.setMethod(split[5]);
            apacheLogDto.setUrl(url);
            collector.collect(apacheLogDto);
        }
    }


    /**
     * Incremental aggregate that counts the records of a window: the accumulator
     * is the running count and is also the final result.
     */
    public static class CountAgg implements AggregateFunction<ApacheLogDto, Integer, Integer> {

        /** Starts every count at zero. */
        @Override
        public Integer createAccumulator() {
            return 0;
        }

        /** Adds one to the count for each incoming record; the record content is not inspected. */
        @Override
        public Integer add(ApacheLogDto apacheLogDto, Integer acc) {
            return Integer.sum(acc, 1);
        }

        /** Returns the accumulated count unchanged. */
        @Override
        public Integer getResult(Integer acc) {
            return acc;
        }

        /** Combines two partial counts by adding them. */
        @Override
        public Integer merge(Integer acc, Integer acc1) {
            return Integer.sum(acc, acc1);
        }
    }

    /**
     * Runs when a window fires: wraps the pre-aggregated count together with the
     * window key (the URL) and the window end time into a {@code BehaviorUriOutWindow}.
     */
    public static class AggResultWinFunction implements WindowFunction<Integer, BehaviorUriOutWindow, String, TimeWindow> {

        @Override
        public void apply(String key, TimeWindow timeWindow, Iterable<Integer> iterable, Collector<BehaviorUriOutWindow> collector) throws Exception {
            // Read the first element of the iterable: the aggregated count for this window.
            final Integer windowCount = iterable.iterator().next();

            final BehaviorUriOutWindow result = new BehaviorUriOutWindow();
            result.setUrl(key);
            result.setAggCount(windowCount);
            result.setEndTime(timeWindow.getEnd());

            collector.collect(result);
        }
    }


    /**
     * Collects the per-URL counts that share a key (the stream is keyed by window
     * end time in {@code main}) and, once a timer fires, emits the {@code topNum}
     * entries with the highest count in descending order.
     */
    public static class MyUrlKeyProcessFunction extends KeyedProcessFunction<Long, BehaviorUriOutWindow, BehaviorUriOutWindow> {

        /** Maximum number of entries emitted per key. */
        private Integer topNum = 1;

        /** Buffers the per-URL counts of the current key until the timer fires. */
        private ListState<BehaviorUriOutWindow> listState;

        /**
         * @param topNum how many of the highest-count URLs to emit per key
         */
        public MyUrlKeyProcessFunction(Integer topNum) {
            this.topNum = topNum;
        }

        @Override
        public void open(Configuration parameters) throws Exception {
            super.open(parameters);
            this.listState = getRuntimeContext().getListState(new ListStateDescriptor<BehaviorUriOutWindow>("host-url", BehaviorUriOutWindow.class));
        }

        /**
         * Buffers the incoming count and registers an event-time timer one second
         * after the record's window end time.
         */
        @Override
        public void processElement(BehaviorUriOutWindow behaviorUriOutWindow, Context context, Collector<BehaviorUriOutWindow> collector) throws Exception {
            this.listState.add(behaviorUriOutWindow);
            // Every record of a key asks for the same timestamp; Flink keeps a single
            // timer per key and timestamp, so repeated registration is harmless.
            context.timerService().registerEventTimeTimer(behaviorUriOutWindow.getEndTime() + 1000L);
        }

        /**
         * Timer callback: sorts the buffered counts in descending order, emits at
         * most {@code topNum} of them and releases the buffered state.
         */
        @Override
        public void onTimer(long timestamp, OnTimerContext ctx, Collector<BehaviorUriOutWindow> out) throws Exception {
            super.onTimer(timestamp, ctx, out);

            List<BehaviorUriOutWindow> list = Lists.newArrayList(listState.get().iterator());
            // The buffered entries are no longer needed once copied; clearing avoids
            // keeping one list per key in state for the lifetime of the job.
            listState.clear();

            // 1. Sort by hit count, highest first (Integer.compare avoids subtraction overflow).
            list.sort((o1, o2) -> Integer.compare(o2.getAggCount(), o1.getAggCount()));

            // 2. Emit the first topNum entries, or all of them when fewer are buffered.
            int limit = Math.min(topNum, list.size());
            for (int i = 0; i < limit; i++) {
                out.collect(list.get(i));
            }
        }
    }

4 结果

在这里插入图片描述

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 7
    评论
评论 7
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值