Flink 案例2-基于log日志的浏览量统计
需求:每隔 5 秒,输出最近 1 分钟内访问量最多的前 N 个 URL
1 pom
<dependencies>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-java</artifactId>
<version>1.13.1</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-streaming-java_2.11</artifactId>
<version>1.13.1</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-clients_2.11</artifactId>
<version>1.13.1</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-connector-kafka_2.11</artifactId>
<version>1.13.0</version>
</dependency>
</dependencies>
2 log数据
208.115.111.72 - - 17/05/2015:11:05:29 +0000 GET /scripts/grok-py-test/
208.115.111.72 - - 17/05/2015:11:05:08 +0000 GET /?N=A&page=21
208.115.111.72 - - 17/05/2015:11:05:49 +0000 GET /blog/geekery/oniguruma-named-capture-example.html?commentlimit=0
208.115.111.72 - - 17/05/2015:11:05:01 +0000 GET /blog/geekery/ssh-key-invalid-hack.html?commentlimit=0
208.115.111.72 - - 17/05/2015:11:05:31 +0000 GET /blog/geekery/server-side-javascript.html
208.115.111.72 - - 17/05/2015:11:05:15 +0000 GET /blog/geekery/yahoo-hackday-08.html
105.235.130.196 - - 17/05/2015:11:05:01 +0000 GET /images/googledotcom.png
174.37.205.76 - - 17/05/2015:11:05:19 +0000 GET /blog
54.255.13.204 - - 17/05/2015:11:05:03 +0000 GET /articles/ssh-security/
105.235.130.196 - - 17/05/2015:11:05:45 +0000 GET /blog/tags/X11
54.255.13.204 - - 17/05/2015:11:05:55 +0000 GET /reset.css
54.255.13.204 - - 17/05/2015:11:05:32 +0000 GET /style2.css
54.255.13.204 - - 17/05/2015:11:05:10 +0000 GET /favicon.ico
105.235.130.196 - - 17/05/2015:11:05:20 +0000 GET /reset.css
54.255.13.204 - - 17/05/2015:11:05:46 +0000 GET /images/jordan-80.png
54.255.13.204 - - 17/05/2015:11:05:17 +0000 GET /images/web/2009/banner.png
105.235.130.196 - - 17/05/2015:11:05:47 +0000 GET /style2.css
105.235.130.196 - - 17/05/2015:11:05:37 +0000 GET /images/jordan-80.png
105.235.130.196 - - 17/05/2015:11:05:22 +0000 GET /images/web/2009/banner.png
134.76.249.10 - - 17/05/2015:11:05:01 +0000 GET /projects/xdotool/
134.76.249.10 - - 17/05/2015:11:05:09 +0000 GET /reset.css
134.76.249.10 - - 17/05/2015:11:05:57 +0000 GET /style2.css
134.76.249.10 - - 17/05/2015:11:05:23 +0000 GET /favicon.ico
134.76.249.10 - - 17/05/2015:11:05:40 +0000 GET /images/jordan-80.png
134.76.249.10 - - 17/05/2015:11:05:50 +0000 GET /images/web/2009/banner.png
134.76.249.10 - - 17/05/2015:11:05:47 +0000 GET /projects/xdotool
134.76.249.10 - - 17/05/2015:11:05:13 +0000 GET /projects/xdotool/
66.249.73.135 - - 17/05/2015:11:05:26 +0000 GET /?flav=atom
207.241.237.220 - - 17/05/2015:11:05:24 +0000 GET /blog/tags/C?page=2
68.184.202.186 - - 17/05/2015:11:05:28 +0000 GET /projects/xpathtool/
68.184.202.186 - - 17/05/2015:11:05:02 +0000 GET /reset.css
68.184.202.186 - - 17/05/2015:11:05:05 +0000 GET /images/jordan-80.png
68.184.202.186 - - 17/05/2015:11:05:02 +0000 GET /style2.css
68.184.202.186 - - 17/05/2015:11:05:37 +0000 GET /images/web/2009/banner.png
68.184.202.186 - - 17/05/2015:11:05:58 +0000 GET /favicon.ico
46.105.14.53 - - 17/05/2015:11:05:29 +0000 GET /blog/tags/puppet?flav=rss20
66.249.73.135 - - 17/05/2015:11:05:00 +0000 GET /?flav=rss20
24.233.162.179 - - 17/05/2015:11:05:31 +0000 GET /favicon.ico
123.125.71.117 - - 17/05/2015:11:05:16 +0000 GET /
220.181.108.153 - - 17/05/2015:11:05:09 +0000 GET /
65.19.138.34 - - 17/05/2015:11:05:40 +0000 GET /
66.249.73.135 - - 17/05/2015:11:05:32 +0000 GET /blog/geekery/rhapsody-on-linux.html
97.116.185.190 - - 17/05/2015:11:05:59 +0000 GET /articles/dynamic-dns-with-dhcp/
97.116.185.190 - - 17/05/2015:11:05:39 +0000 GET /reset.css
97.116.185.190 - - 17/05/2015:11:05:29 +0000 GET /style2.css
97.116.185.190 - - 17/05/2015:11:05:39 +0000 GET /images/jordan-80.png
97.116.185.190 - - 17/05/2015:11:05:02 +0000 GET /images/web/2009/banner.png
97.116.185.190 - - 17/05/2015:11:05:35 +0000 GET /favicon.ico
5.255.72.168 - - 17/05/2015:11:05:21 +0000 GET /
5.255.72.168 - - 17/05/2015:11:05:08 +0000 GET /blog/geekery/installing-windows-8-consumer-preview.html
46.105.14.53 - - 17/05/2015:11:05:33 +0000 GET /blog/tags/puppet?flav=rss20
5.102.173.71 - - 17/05/2015:11:05:13 +0000 GET /robots.txt
5.102.173.71 - - 17/05/2015:11:05:06 +0000 GET /projects/xdotool/
208.91.156.11 - - 17/05/2015:11:05:05 +0000 GET /files/logstash/logstash-1.3.2-monolithic.jar
66.249.73.185 - - 17/05/2015:11:05:58 +0000 GET /presentations/logstash-1/
74.125.176.81 - - 17/05/2015:11:05:28 +0000 GET /?flav=rss20
66.249.73.135 - - 17/05/2015:11:05:14 +0000 GET /blog/geekery/xdotool-2.20110530.html
187.45.193.158 - - 17/05/2015:11:05:54 +0000 GET /presentations/logstash-1/file/about-me/tequila-face.jpg
90.220.199.149 - - 17/05/2015:11:05:18 +0000 GET /blog/geekery/puppet-manage-homedirectory-contents.html
90.220.199.149 - - 17/05/2015:11:05:24 +0000 GET /reset.css
90.220.199.149 - - 17/05/2015:11:05:50 +0000 GET /style2.css
90.220.199.149 - - 17/05/2015:12:05:37 +0000 GET /images/jordan-80.png
90.220.199.149 - - 17/05/2015:12:05:21 +0000 GET /images/web/2009/banner.png
90.220.199.149 - - 17/05/2015:12:05:17 +0000 GET /favicon.ico
36.38.8.174 - - 17/05/2015:12:05:24 +0000 GET /blog/geekery/ssl-latency.html
36.38.8.174 - - 17/05/2015:12:05:36 +0000 GET /reset.css
36.38.8.174 - - 17/05/2015:12:05:14 +0000 GET /style2.css
36.38.8.174 - - 17/05/2015:12:05:44 +0000 GET /images/jordan-80.png
36.38.8.174 - - 17/05/2015:12:05:17 +0000 GET /images/web/2009/banner.png
36.38.8.174 - - 17/05/2015:12:05:39 +0000 GET /favicon.ico
71.207.12.53 - - 17/05/2015:12:05:17 +0000 GET /favicon.ico
220.241.45.142 - - 17/05/2015:12:05:07 +0000 GET /robots.txt
220.241.45.142 - - 17/05/2015:12:05:30 +0000 GET /projects/firefox-tabsearch/
209.85.238.199 - - 17/05/2015:12:05:21 +0000 GET /?flav=atom
46.105.14.53 - - 17/05/2015:12:05:53 +0000 GET /blog/tags/puppet?flav=rss20
66.249.73.135 - - 17/05/2015:12:05:28 +0000 GET /blog/tags/noise
146.1.1.2 - - 17/05/2015:12:05:51 +0000 GET /blog/geekery/ssl-latency.html
146.1.1.2 - - 17/05/2015:12:05:18 +0000 GET /style2.css
146.1.1.2 - - 17/05/2015:12:05:55 +0000 GET /reset.css
146.1.1.2 - - 17/05/2015:12:05:24 +0000 GET /favicon.ico
3 code
/**
 * Job entry point: every 5 seconds, prints the top-N most visited URLs of the last 60 seconds.
 *
 * <p>Pipeline: read the log file, parse each line with {@code MyFlatMapFunction}, assign
 * event-time timestamps and watermarks, count hits per URL in a sliding window, then rank the
 * counts of each window with {@code MyUrlKeyProcessFunction} and print the result.
 *
 * @param args optional; when present, {@code args[0]} is used as the input file path instead of
 *             the built-in default
 * @throws Exception if the Flink job cannot be built or fails while running
 */
public static void main(String[] args) throws Exception {
    // Keep the original location as the default so existing launch configurations still work.
    String inputPath = (args != null && args.length > 0)
            ? args[0]
            : "D:\\linxq\\mime\\code\\flink\\src\\main\\resources\\LogTest.csv";

    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    // Use event time (the timestamp carried by each log record) for windowing.
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);

    // The sample log is NOT sorted by time (records inside the same minute arrive in arbitrary
    // order), so an ascending-timestamp assigner would push the watermark ahead of later records
    // and those records would be dropped from windows that had already fired. Holding the
    // watermark back by one minute tolerates that disorder.
    // NOTE(review): 60s covers the disorder visible in the sample data; confirm for real inputs.
    Time maxOutOfOrderness = Time.seconds(60);

    DataStream<ApacheLogDto> dataStream = env.readTextFile(inputPath)
            .flatMap(new MyFlatMapFunction())
            .assignTimestampsAndWatermarks(
                    new org.apache.flink.streaming.api.functions.timestamps.BoundedOutOfOrdernessTimestampExtractor<ApacheLogDto>(maxOutOfOrderness) {
                        @Override
                        public long extractTimestamp(ApacheLogDto apacheLogDto) {
                            return apacheLogDto.getTimestamp();
                        }
                    });

    // Group by URL and count hits in a 60s window that slides every 5s.
    SingleOutputStreamOperator<BehaviorUriOutWindow> aggStream = dataStream.keyBy(ApacheLogDto::getUrl)
            .timeWindow(Time.seconds(60), Time.seconds(5))
            .aggregate(new CountAgg(), new AggResultWinFunction());

    // Regroup by window end so that all URL counts of one window are ranked together,
    // then emit the top 2 URLs of each window.
    aggStream.keyBy(BehaviorUriOutWindow::getEndTime).process(new MyUrlKeyProcessFunction(2))
            .print("hot url:").setParallelism(1);

    env.execute();
}
/**
 * Parses one raw access-log line into an {@code ApacheLogDto} and drops requests for static
 * resources.
 *
 * <p>Input is split on single spaces, for example
 * {@code 79.185.184.23 - - 17/05/2015:12:05:31 +0000 GET /reset.css}:
 * field 0 is the client IP, field 3 the timestamp, field 5 the HTTP method and field 6 the
 * requested URL. The query string (everything from the first {@code ?}) is removed from the URL.
 */
public static class MyFlatMapFunction implements FlatMapFunction<String, ApacheLogDto> {
    /** Number of space-separated fields a line needs before fields 0..6 can be read. */
    private static final int MIN_FIELD_COUNT = 7;

    /**
     * Matches URLs that do NOT end in one of the listed static-resource extensions.
     * Compiled once instead of on every record.
     */
    private static final Pattern NON_STATIC_URL =
            Pattern.compile("^((?!\\.(css|js|png|ico|html|txt|jar|jpg)$).)*$");

    // NOTE(review): the zone offset field ("+0000") is not parsed, so timestamps are interpreted
    // in the JVM default time zone — confirm whether that is intended.
    private final SimpleDateFormat simpleDateFormat = new SimpleDateFormat("dd/MM/yyyy:HH:mm:ss");

    /**
     * Converts a log line to a DTO and emits it unless the URL points at a static resource.
     *
     * @param s         raw log line
     * @param collector receives at most one parsed record per line
     * @throws Exception if the timestamp field cannot be parsed
     */
    @Override
    public void flatMap(String s, Collector<ApacheLogDto> collector) throws Exception {
        String[] split = s.split(" ");
        if (split.length < MIN_FIELD_COUNT) {
            // Blank or truncated line: nothing to parse, skip it instead of failing the job.
            return;
        }

        // Strip the query string; indexOf also copes with a URL that is just "?".
        String rawUrl = split[6];
        int queryStart = rawUrl.indexOf('?');
        String url = queryStart >= 0 ? rawUrl.substring(0, queryStart) : rawUrl;

        ApacheLogDto apacheLogDto = new ApacheLogDto();
        apacheLogDto.setIpAddr(split[0]);
        apacheLogDto.setTimestamp(simpleDateFormat.parse(split[3]).getTime());
        apacheLogDto.setMethod(split[5]);
        apacheLogDto.setUrl(url);

        // Keep only URLs without a css/js/png/ico/html/txt/jar/jpg suffix.
        if (NON_STATIC_URL.matcher(url).matches()) {
            collector.collect(apacheLogDto);
        }
    }
}
/**
 * Incremental aggregate that counts records: the accumulator is the running count and the
 * result is that count unchanged.
 */
public static class CountAgg implements AggregateFunction<ApacheLogDto, Integer, Integer> {

    /** A fresh accumulator starts at zero. */
    @Override
    public Integer createAccumulator() {
        return Integer.valueOf(0);
    }

    /** Every incoming record raises the count by one; the record's content is not inspected. */
    @Override
    public Integer add(ApacheLogDto logEntry, Integer runningCount) {
        final int incremented = runningCount + 1;
        return incremented;
    }

    /** The accumulator already is the final value. */
    @Override
    public Integer getResult(Integer runningCount) {
        return runningCount;
    }

    /** Two partial counts combine by addition. */
    @Override
    public Integer merge(Integer left, Integer right) {
        return Integer.sum(left, right);
    }
}
/**
 * Runs when a window is emitted: wraps the window's aggregated count together with its key
 * (the URL) and the window end time into a {@code BehaviorUriOutWindow}.
 */
public static class AggResultWinFunction implements WindowFunction<Integer, BehaviorUriOutWindow, String, TimeWindow> {

    /**
     * @param key        the URL this window was keyed by
     * @param timeWindow the window being emitted; only its end timestamp is used
     * @param iterable   aggregated values for the window; only the first element is read
     * @param collector  receives exactly one result record
     */
    @Override
    public void apply(String key, TimeWindow timeWindow, Iterable<Integer> iterable, Collector<BehaviorUriOutWindow> collector) throws Exception {
        // Read the aggregated count first, then assemble the output record around it.
        final Integer hits = iterable.iterator().next();
        final long windowEnd = timeWindow.getEnd();

        final BehaviorUriOutWindow result = new BehaviorUriOutWindow();
        result.setEndTime(windowEnd);
        result.setAggCount(hits);
        result.setUrl(key);

        collector.collect(result);
    }
}
/**
 * Ranks the per-URL counts that share the same window end time and emits the {@code topNum}
 * entries with the highest counts.
 *
 * <p>The stream is keyed by window end time. Each incoming count is buffered in keyed list
 * state; an event-time timer set one second after the window end triggers the ranking, after
 * which the buffered state for that key is released.
 */
public static class MyUrlKeyProcessFunction extends KeyedProcessFunction<Long, BehaviorUriOutWindow, BehaviorUriOutWindow> {
    /** How many of the highest-count entries to emit per window. */
    private Integer topNum = 1;
    /** Buffers the per-URL counts of the current key (window end time) until the timer fires. */
    private ListState<BehaviorUriOutWindow> listState;

    /**
     * @param topNum number of entries to emit per window
     */
    public MyUrlKeyProcessFunction(Integer topNum) {
        this.topNum = topNum;
    }

    /** Obtains the keyed list state used as the per-window buffer. */
    @Override
    public void open(Configuration parameters) throws Exception {
        super.open(parameters);
        this.listState = getRuntimeContext().getListState(new ListStateDescriptor<BehaviorUriOutWindow>("host-url", BehaviorUriOutWindow.class));
    }

    /** Buffers the record and schedules the ranking for its window. */
    @Override
    public void processElement(BehaviorUriOutWindow behaviorUriOutWindow, Context context, Collector<BehaviorUriOutWindow> collector) throws Exception {
        this.listState.add(behaviorUriOutWindow);
        // Every record of the same window registers the same timestamp; Flink keeps only one
        // timer per key and timestamp, so the ranking runs once per window.
        context.timerService().registerEventTimeTimer(behaviorUriOutWindow.getEndTime() + 1000L);
    }

    /**
     * Timer callback: sorts the buffered counts in descending order, emits the first
     * {@code topNum} of them (or all, if fewer were buffered) and clears the buffer.
     */
    @Override
    public void onTimer(long timestamp, OnTimerContext ctx, Collector<BehaviorUriOutWindow> out) throws Exception {
        super.onTimer(timestamp, ctx, out);

        // Copy the buffered records out of state, then release the state: this key's window is
        // finished, and without the clear every window end would keep its list forever.
        List<BehaviorUriOutWindow> list = Lists.newArrayList(listState.get().iterator());
        listState.clear();

        // 1. Sort by hit count, highest first (Integer.compare avoids subtraction overflow).
        list.sort((o1, o2) -> Integer.compare(o2.getAggCount(), o1.getAggCount()));

        // 2. Emit the first topNum entries, or everything when fewer are available.
        int limit = Math.min(topNum, list.size());
        for (int i = 0; i < limit; i++) {
            out.collect(list.get(i));
        }
    }
}