flink日志解析

flink 代码解析日志

/**
 * Flink streaming job that reports the most-viewed pages found in an Apache access log.
 *
 * <p>Pipeline: read the log file, parse each line into an {@code ApacheLogEvent}, assign
 * event-time timestamps and watermarks, count GET requests per URL in sliding windows,
 * then rank the per-window counts and print the top N as a formatted report.
 */
public class HotPages {

    /**
     * Matches URLs that do not end in a static-resource extension (css, js, png, ico).
     * Compiled once instead of once per record.
     */
    private static final Pattern NON_STATIC_RESOURCE_URL =
            Pattern.compile("^((?!\\.(css|js|png|ico)$).)*$");

    /**
     * Builds and runs the job.
     *
     * @param args command-line arguments (unused)
     * @throws Exception if the job fails to build or execute
     */
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        // Single parallel instance so the printed output is not interleaved.
        env.setParallelism(1);
        // Windows and timers are driven by the timestamps carried in the log lines.
        env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);

        // Read the raw log lines from a local file.
        // Alternative: resolve the file from the classpath instead of an absolute path, e.g.
        //   env.readTextFile(HotPages.class.getResource("/apache.log").getPath());
        DataStreamSource<String> inputStream = env.readTextFile("D:\\Tool\\Idea2020\\UserBehaviorAnalysis\\NetWorkFlowAnalysis\\src\\main\\resources\\apache.log");

        // Parse every line into an ApacheLogEvent, then assign timestamps and watermarks.
        DataStream<ApacheLogEvent> dataStream = inputStream.map(new MapFunction<String, ApacheLogEvent>() {
            // One formatter per function instance rather than one per record. SimpleDateFormat is
            // not thread-safe, so it is kept as an instance field and never shared statically.
            private final SimpleDateFormat timestampFormat =
                    new SimpleDateFormat("dd/MM/yyyy:HH:mm:ss");

            @Override
            public ApacheLogEvent map(String s) throws Exception {
                // Space-separated fields; index 3 is the timestamp, 5 and 6 feed the last two
                // constructor arguments (read back below via getMethod() and getUrl()).
                String[] splits = s.split(" ");
                Long timestamp = timestampFormat.parse(splits[3]).getTime();

                return new ApacheLogEvent(splits[0], splits[1], timestamp, splits[5], splits[6]);
            }
        })
                // Out-of-order data: the watermark trails the largest seen timestamp by one minute.
                .assignTimestampsAndWatermarks(new BoundedOutOfOrdernessTimestampExtractor<ApacheLogEvent>(Time.minutes(1)) {
                    @Override
                    public long extractTimestamp(ApacheLogEvent apacheLogEvent) {
                        return apacheLogEvent.getTimestamp();
                    }
                });

        // Filter, key by URL, and count per sliding window.
        SingleOutputStreamOperator<PageViewCount> windowAggStream = dataStream
                .filter(new FilterFunction<ApacheLogEvent>() {
                    // Keep GET requests only.
                    @Override
                    public boolean filter(ApacheLogEvent apacheLogEvent) throws Exception {
                        return "GET".equals(apacheLogEvent.getMethod());
                    }
                })
                .filter(new FilterFunction<ApacheLogEvent>() {
                    // Drop static resources to reduce the volume of data.
                    @Override
                    public boolean filter(ApacheLogEvent apacheLogEvent) throws Exception {
                        return NON_STATIC_RESOURCE_URL.matcher(apacheLogEvent.getUrl()).matches();
                    }
                })
                .keyBy(ApacheLogEvent::getUrl)
                // 10-minute windows sliding every 4 seconds.
                .timeWindow(Time.minutes(10), Time.seconds(4))
                .aggregate(new PageCountAgg(), new PageCountResult());

        // Gather all counts that belong to the same window and emit them ranked.
        SingleOutputStreamOperator<String> resultStream = windowAggStream.keyBy(PageViewCount::getWindowEnd)
                .process(new TopNHotPages(3));
        resultStream.print();

        env.execute("hot pages job");

    }

    /** Incremental pre-aggregation: counts the events that fall into one window of one key. */
    public static class PageCountAgg implements AggregateFunction<ApacheLogEvent, Long, Long> {

        /** @return the initial count, zero */
        @Override
        public Long createAccumulator() {
            return 0L;
        }

        /** @return the running count incremented by one for the incoming event */
        @Override
        public Long add(ApacheLogEvent apacheLogEvent, Long aLong) {
            return aLong + 1;
        }

        /** @return the accumulated count unchanged */
        @Override
        public Long getResult(Long aLong) {
            return aLong;
        }

        /** @return the sum of two partial counts */
        @Override
        public Long merge(Long a, Long b) {
            return a + b;
        }
    }

    /** Window function that wraps the aggregated count with its key (URL) and window end. */
    public static class PageCountResult implements WindowFunction<Long, PageViewCount, String, TimeWindow> {

        /**
         * Emits one {@code PageViewCount} for the window.
         *
         * @param s          the key, i.e. the URL
         * @param timeWindow the window being evaluated
         * @param iterable   the pre-aggregated values; its first element is used as the count
         * @param collector  receives the resulting record
         */
        @Override
        public void apply(String s, TimeWindow timeWindow, Iterable<Long> iterable, Collector<PageViewCount> collector) throws Exception {
            Long count = iterable.iterator().next();
            collector.collect(new PageViewCount(s, timeWindow.getEnd(), count));
        }
    }

    /**
     * Keyed by window end: buffers every {@code PageViewCount} of a window and, once the window
     * is complete, emits a formatted report of the {@code topSize} most-viewed URLs.
     */
    public static class TopNHotPages extends KeyedProcessFunction<Long, PageViewCount, String> {
        private final Integer topSize;

        /**
         * @param topSize maximum number of URLs listed in each report
         */
        public TopNHotPages(Integer topSize) {
            this.topSize = topSize;
        }

        // All PageViewCount records received so far for the current window end.
        ListState<PageViewCount> pageViewCountListState;

        @Override
        public void open(Configuration parameters) throws Exception {
            pageViewCountListState = getRuntimeContext()
                    .getListState(new ListStateDescriptor<PageViewCount>("pagecountlist", PageViewCount.class));

        }


        /**
         * Buffers the record and schedules the report for just after the window end.
         * Registering the same timestamp repeatedly for one key results in a single timer.
         */
        @Override
        public void processElement(PageViewCount pageViewCount, Context context, Collector<String> collector) throws Exception {
            pageViewCountListState.add(pageViewCount);
            context.timerService().registerEventTimeTimer(pageViewCount.getWindowEnd() + 1);
        }

        /**
         * Fires at window end + 1 ms: sorts the buffered counts in descending order and emits
         * the top {@code topSize} entries as one formatted string.
         */
        @Override
        public void onTimer(long timestamp, OnTimerContext ctx, Collector<String> out) throws Exception {
            ArrayList<PageViewCount> pageViewCounts = Lists.newArrayList(pageViewCountListState.get().iterator());
            // The window is complete and nothing is registered after this timer, so release the
            // buffered records; otherwise state accumulates for every window end ever seen.
            pageViewCountListState.clear();

            // Highest count first; equal counts compare as 0 so the comparator contract holds.
            pageViewCounts.sort((left, right) -> Long.compare(right.getCount(), left.getCount()));

            StringBuilder resultBuilder = new StringBuilder();
            resultBuilder.append("======================================");
            // The timer was registered at window end + 1, so subtract 1 to show the window end.
            resultBuilder.append("窗口结束时间:").append(new Timestamp(timestamp - 1)).append("\n");

            int limit = Math.min(topSize, pageViewCounts.size());
            for (int i = 0; i < limit; i++) {
                PageViewCount currentItemViewCount = pageViewCounts.get(i);
                resultBuilder
                        .append("No")
                        .append(i + 1)
                        .append(":")
                        .append(" URL = ")
                        .append(currentItemViewCount.getUrl())
                        .append(" 浏览量 = ")
                        .append(currentItemViewCount.getCount())
                        .append("\n");

            }
            resultBuilder.append("======================================\n\n");
            // NOTE(review): pauses for one second per report, presumably to pace console output
            // for the demo. This blocks the operator thread; remove it outside of a demo.
            Thread.sleep(1000L);
            out.collect(resultBuilder.toString());
        }
    }
}

result:

======================================窗口结束时间:2015-05-17 10:08:52.0
No1: URL = /blog/tags/puppet?flav=rss20 浏览量 = 3
No2: URL = /blog/tags/firefox?flav=rss20 浏览量 = 2
No3: URL = / 浏览量 = 2
======================================


======================================窗口结束时间:2015-05-17 10:08:56.0
No1: URL = /blog/tags/puppet?flav=rss20 浏览量 = 3
No2: URL = /blog/tags/firefox?flav=rss20 浏览量 = 2
No3: URL = / 浏览量 = 2
======================================


======================================窗口结束时间:2015-05-17 10:09:00.0
No1: URL = /blog/tags/puppet?flav=rss20 浏览量 = 3
No2: URL = /blog/tags/firefox?flav=rss20 浏览量 = 2
No3: URL = / 浏览量 = 2
======================================

Pom

<!--版本管理-->
    <properties>
        <flink.version>1.10.1</flink.version>
        <scala.binary.version>2.12</scala.binary.version>
        <kafka.version>2.2.0</kafka.version>
    </properties>

    <dependencies>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-java</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-streaming-java_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <!-- Kafka client -->
        <dependency>
            <groupId>org.apache.kafka</groupId>
            <artifactId>kafka_${scala.binary.version}</artifactId>
            <version>${kafka.version}</version>
        </dependency>
        <!-- Official Flink connector for Kafka -->
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-connector-kafka_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
        </dependency>

        <!-- Table API. Versions come from the shared properties, like the dependencies above,
             so that a version bump only has to be made in one place. -->
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-table-planner_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-table-api-java-bridge_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-table-planner-blink_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
        </dependency>
    </dependencies>

日志数据放于 resources 目录下(即 src/main/resources/apache.log)

链接:https://pan.baidu.com/s/1RNFXsBdjNM4_lYL8L7il0A 
提取码:czz1 

优化后代码

/**
 * Flink streaming job that reports the most-viewed pages found in an Apache access log,
 * with support for late-arriving events.
 *
 * <p>Pipeline: read the log file, parse each line into an {@code ApacheLogEvent}, assign
 * event-time timestamps and watermarks, count GET requests per URL in sliding windows that
 * accept late data for one minute, then rank the per-window counts and print the top N.
 * Events that arrive after the allowed lateness are routed to a side output.
 */
public class HotPages {

    /**
     * Matches URLs that do not end in a static-resource extension (css, js, png, ico).
     * Compiled once instead of once per record.
     */
    private static final Pattern NON_STATIC_RESOURCE_URL =
            Pattern.compile("^((?!\\.(css|js|png|ico)$).)*$");

    /**
     * Builds and runs the job.
     *
     * @param args command-line arguments (unused)
     * @throws Exception if the job fails to build or execute
     */
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        // Single parallel instance so the printed output is not interleaved.
        env.setParallelism(1);
        // Windows and timers are driven by the timestamps carried in the log lines.
        env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);

        // Read the raw log lines from a local file.
        // Alternative: resolve the file from the classpath instead of an absolute path, e.g.
        //   env.readTextFile(HotPages.class.getResource("/apache.log").getPath());
        DataStreamSource<String> inputStream = env.readTextFile("D:\\Tool\\Idea2020\\UserBehaviorAnalysis\\NetWorkFlowAnalysis\\src\\main\\resources\\apache.log");
        // NOTE: these progress messages are printed while the job graph is being built,
        // i.e. before env.execute() starts processing any data.
        System.out.println("一号位输出完毕");

        // Parse every line into an ApacheLogEvent, then assign timestamps and watermarks.
        DataStream<ApacheLogEvent> dataStream = inputStream.map(new MapFunction<String, ApacheLogEvent>() {
            // One formatter per function instance rather than one per record. SimpleDateFormat is
            // not thread-safe, so it is kept as an instance field and never shared statically.
            private final SimpleDateFormat timestampFormat =
                    new SimpleDateFormat("dd/MM/yyyy:HH:mm:ss");

            @Override
            public ApacheLogEvent map(String s) throws Exception {
                // Space-separated fields; index 3 is the timestamp, 5 and 6 feed the last two
                // constructor arguments (read back below via getMethod() and getUrl()).
                String[] splits = s.split(" ");
                Long timestamp = timestampFormat.parse(splits[3]).getTime();

                return new ApacheLogEvent(splits[0], splits[1], timestamp, splits[5], splits[6]);
            }
        })
                // Out-of-order data: the watermark trails the largest seen timestamp by one second;
                // anything later than that is handled by the window's allowed lateness below.
                .assignTimestampsAndWatermarks(new BoundedOutOfOrdernessTimestampExtractor<ApacheLogEvent>(Time.seconds(1)) {
                    @Override
                    public long extractTimestamp(ApacheLogEvent apacheLogEvent) {
                        return apacheLogEvent.getTimestamp();
                    }
                });
        System.out.println("二号位输出完毕");


        // Side output for events that are too late even for the allowed lateness.
        // The tag's element type must match the element type entering the window.
        OutputTag<ApacheLogEvent> lateTag = new OutputTag<ApacheLogEvent>("late") {};

        // Filter, key by URL, and count per sliding window.
        SingleOutputStreamOperator<PageViewCount> windowAggStream = dataStream
                .filter(new FilterFunction<ApacheLogEvent>() {
                    // Keep GET requests only.
                    @Override
                    public boolean filter(ApacheLogEvent apacheLogEvent) throws Exception {
                        return "GET".equals(apacheLogEvent.getMethod());
                    }
                })
                .filter(new FilterFunction<ApacheLogEvent>() {
                    // Drop static resources to reduce the volume of data.
                    @Override
                    public boolean filter(ApacheLogEvent apacheLogEvent) throws Exception {
                        return NON_STATIC_RESOURCE_URL.matcher(apacheLogEvent.getUrl()).matches();
                    }
                })
                .keyBy(ApacheLogEvent::getUrl)
                // 10-minute windows sliding every 4 seconds.
                .timeWindow(Time.minutes(10), Time.seconds(4))
                // Keep window state for one more minute so late events update the count.
                .allowedLateness(Time.minutes(1))
                .sideOutputLateData(lateTag)
                .aggregate(new PageCountAgg(), new PageCountResult());

        windowAggStream.print("agg");
        windowAggStream.getSideOutput(lateTag).print();
        System.out.println("三号位输出完毕");

        // Gather all counts that belong to the same window and emit them ranked.
        SingleOutputStreamOperator<String> resultStream = windowAggStream.keyBy(PageViewCount::getWindowEnd)
                .process(new TopNHotPages(3));
        System.out.println("四号位输出完毕");
        resultStream.print();

        env.execute("hot pages job");

    }

    /** Incremental pre-aggregation: counts the events that fall into one window of one key. */
    public static class PageCountAgg implements AggregateFunction<ApacheLogEvent, Long, Long> {

        /** @return the initial count, zero */
        @Override
        public Long createAccumulator() {
            return 0L;
        }

        /** @return the running count incremented by one for the incoming event */
        @Override
        public Long add(ApacheLogEvent apacheLogEvent, Long aLong) {
            return aLong + 1;
        }

        /** @return the accumulated count unchanged */
        @Override
        public Long getResult(Long aLong) {
            return aLong;
        }

        /** @return the sum of two partial counts */
        @Override
        public Long merge(Long a, Long b) {
            return a + b;
        }
    }

    /** Window function that wraps the aggregated count with its key (URL) and window end. */
    public static class PageCountResult implements WindowFunction<Long, PageViewCount, String, TimeWindow> {

        /**
         * Emits one {@code PageViewCount} for the window.
         *
         * @param s          the key, i.e. the URL
         * @param timeWindow the window being evaluated
         * @param iterable   the pre-aggregated values; its first element is used as the count
         * @param collector  receives the resulting record
         */
        @Override
        public void apply(String s, TimeWindow timeWindow, Iterable<Long> iterable, Collector<PageViewCount> collector) throws Exception {
            Long count = iterable.iterator().next();
            collector.collect(new PageViewCount(s, timeWindow.getEnd(), count));
        }
    }

    /**
     * Keyed by window end: keeps the latest count per URL for a window and emits a formatted
     * report of the {@code topSize} most-viewed URLs each time the report timer fires.
     *
     * <p>A map keyed by URL is used instead of a list so that a refreshed count for the same
     * URL (produced when a late event re-fires the window) replaces the previous value rather
     * than being listed twice.
     */
    public static class TopNHotPages extends KeyedProcessFunction<Long, PageViewCount, String> {

        /**
         * How long after the window end the per-window state is kept, in milliseconds.
         * Should match the allowed lateness configured on the window in {@code main}.
         */
        private static final long STATE_RETENTION_MS = 60 * 1000L;

        private final Integer topSize;

        /**
         * @param topSize maximum number of URLs listed in each report
         */
        public TopNHotPages(Integer topSize) {
            this.topSize = topSize;
        }

        // Latest known view count per URL for the current window end.
        MapState<String,Long> pageViewCountMapState;

        @Override
        public void open(Configuration parameters) throws Exception {
            pageViewCountMapState = getRuntimeContext()
                    .getMapState(new MapStateDescriptor<String,Long>("page-count-map",String.class,Long.class));

        }


        /**
         * Stores (or overwrites) the count for the record's URL and registers two timers:
         * one just after the window end to emit the report, and one a minute after the
         * window end to discard the state.
         */
        @Override
        public void processElement(PageViewCount pageViewCount, Context context, Collector<String> collector) throws Exception {
            pageViewCountMapState.put(pageViewCount.getUrl(), pageViewCount.getCount());

            // Report timer.
            context.timerService().registerEventTimeTimer(pageViewCount.getWindowEnd() + 1);
            // Cleanup timer.
            context.timerService().registerEventTimeTimer(pageViewCount.getWindowEnd() + STATE_RETENTION_MS);
        }

        /**
         * Handles both timers. The cleanup timer clears the state and emits nothing; the report
         * timer sorts the stored counts in descending order and emits the top entries.
         */
        @Override
        public void onTimer(long timestamp, OnTimerContext ctx, Collector<String> out) throws Exception {
            // The stream is keyed by window end, so the current key identifies the window.
            // ctx.timestamp() cannot be used here: inside onTimer it is the firing timer's own
            // timestamp, so "timestamp == ctx.timestamp() + 60s" would never be true, the state
            // would never be cleared, and the cleanup timer would emit a spurious report.
            if (timestamp == ctx.getCurrentKey() + STATE_RETENTION_MS) {
                pageViewCountMapState.clear();
                return;
            }

            ArrayList<Map.Entry<String,Long>> pageViewCounts = Lists.newArrayList(pageViewCountMapState.entries().iterator());
            // Highest count first; equal counts compare as 0 so the comparator contract holds.
            pageViewCounts.sort((left, right) -> Long.compare(right.getValue(), left.getValue()));

            StringBuilder resultBuilder = new StringBuilder();
            resultBuilder.append("======================================");
            // The timer was registered at window end + 1, so subtract 1 to show the window end.
            resultBuilder.append("窗口结束时间:").append(new Timestamp(timestamp - 1)).append("\n");

            int limit = Math.min(topSize, pageViewCounts.size());
            for (int i = 0; i < limit; i++) {
                Map.Entry<String, Long> currentItemViewCount = pageViewCounts.get(i);
                resultBuilder
                        .append("No")
                        .append(i + 1)
                        .append(":")
                        .append(" URL = ")
                        .append(currentItemViewCount.getKey())
                        .append(" 浏览量 = ")
                        .append(currentItemViewCount.getValue())
                        .append("\n");

            }
            resultBuilder.append("======================================\n\n");
            // NOTE(review): pauses for one second per report, presumably to pace console output
            // for the demo. This blocks the operator thread; remove it outside of a demo.
            Thread.sleep(1000L);
            out.collect(resultBuilder.toString());
        }
    }
}

  • 3
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值