实战二说明:
本文是视频学习笔记的第二篇,在实战一的基础上继续往下学习。
1、首先新建模块 目录结构:
同样,本模块内的pom不需要修改,直接沿用主pom即可。
2、数据源(这里数据源使用文件读取,展示部分内容)
3、代码展示
首先是pojo类里边的接收类,把接收到的数据转成对象
package Bean;
/**
 * Mutable bean holding the fields of one parsed access-log entry.
 *
 * <p>Exposes a public no-argument constructor plus a getter/setter pair for
 * every field.
 */
public class ApachLogEvent {

    private String ip;
    private String userId;
    private Long timestamp;
    private String method;
    private String url;

    /** Creates an event with every field left as {@code null}. */
    public ApachLogEvent() {
    }

    /**
     * Creates a fully populated event.
     *
     * @param ip        value stored as the client address
     * @param userId    value stored as the user identifier
     * @param timestamp value stored as the event timestamp
     * @param method    value stored as the request method
     * @param url       value stored as the requested URL
     */
    public ApachLogEvent(String ip, String userId, Long timestamp, String method, String url) {
        this.ip = ip;
        this.userId = userId;
        this.timestamp = timestamp;
        this.method = method;
        this.url = url;
    }

    /** @return the stored client address */
    public String getIp() {
        return this.ip;
    }

    /** @param ip the client address to store */
    public void setIp(String ip) {
        this.ip = ip;
    }

    /** @return the stored user identifier */
    public String getUserId() {
        return this.userId;
    }

    /** @param userId the user identifier to store */
    public void setUserId(String userId) {
        this.userId = userId;
    }

    /** @return the stored event timestamp */
    public Long getTimestamp() {
        return this.timestamp;
    }

    /** @param timestamp the event timestamp to store */
    public void setTimestamp(Long timestamp) {
        this.timestamp = timestamp;
    }

    /** @return the stored request method */
    public String getMethod() {
        return this.method;
    }

    /** @param method the request method to store */
    public void setMethod(String method) {
        this.method = method;
    }

    /** @return the stored URL */
    public String getUrl() {
        return this.url;
    }

    /** @param url the URL to store */
    public void setUrl(String url) {
        this.url = url;
    }

    /**
     * Renders all fields in declaration order; string fields are wrapped in
     * single quotes and {@code null} values are printed as {@code null}.
     */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("Bean.ApachLogEvent{");
        text.append("ip='").append(this.ip).append('\'');
        text.append(", userId='").append(this.userId).append('\'');
        text.append(", timestamp=").append(this.timestamp);
        text.append(", method='").append(this.method).append('\'');
        text.append(", url='").append(this.url).append('\'');
        text.append('}');
        return text.toString();
    }
}
然后是转换类,用于记录结果及部分属性
package Bean;
/**
 * Mutable bean pairing a URL with a window-end timestamp and a count.
 *
 * <p>Exposes a public no-argument constructor plus a getter/setter pair for
 * every field.
 */
public class PageViewCount {

    private String url;
    private Long windowEnd;
    private Long count;

    /** Creates a record with every field left as {@code null}. */
    public PageViewCount() {
    }

    /**
     * Creates a fully populated record.
     *
     * @param url       value stored as the URL
     * @param windowEnd value stored as the window-end timestamp
     * @param count     value stored as the count
     */
    public PageViewCount(String url, Long windowEnd, Long count) {
        this.url = url;
        this.windowEnd = windowEnd;
        this.count = count;
    }

    /** @return the stored URL */
    public String getUrl() {
        return this.url;
    }

    /** @param url the URL to store */
    public void setUrl(String url) {
        this.url = url;
    }

    /** @return the stored window-end timestamp */
    public Long getWindowEnd() {
        return this.windowEnd;
    }

    /** @param windowEnd the window-end timestamp to store */
    public void setWindowEnd(Long windowEnd) {
        this.windowEnd = windowEnd;
    }

    /** @return the stored count */
    public Long getCount() {
        return this.count;
    }

    /** @param count the count to store */
    public void setCount(Long count) {
        this.count = count;
    }

    /**
     * Renders all fields in declaration order; the URL is wrapped in single
     * quotes and {@code null} values are printed as {@code null}.
     */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("PageViewCount{");
        text.append("url='").append(this.url).append('\'');
        text.append(", windowEnd=").append(this.windowEnd);
        text.append(", count=").append(this.count);
        text.append('}');
        return text.toString();
    }
}
最后是项目代码flink主程序
代码主要逻辑:
1、转换数据成pojo类、顺便进行数据过滤
2、开窗统计每个url在该滑动窗口的访问次数
3、再根据窗口结束时间分组,统计每个窗口内访问次数排名前三的url
package Project;
import Bean.ApachLogEvent;
import Bean.PageViewCount;
import com.sun.org.apache.xerces.internal.dom.PSVIElementNSImpl;
import org.apache.commons.compress.utils.Lists;
import org.apache.flink.api.common.functions.AggregateFunction;
import org.apache.flink.api.common.state.ListState;
import org.apache.flink.api.common.state.ListStateDescriptor;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.TimeCharacteristic;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.KeyedProcessFunction;
import org.apache.flink.streaming.api.functions.timestamps.BoundedOutOfOrdernessTimestampExtractor;
import org.apache.flink.streaming.api.functions.windowing.WindowFunction;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
import org.apache.flink.util.Collector;
import java.net.URL;
import java.sql.Timestamp;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Comparator;
public class HotPages {
public static void main(String[] args) throws Exception{
//创建环境
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
//设置时间语义
env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);
//设置并行度
env.setParallelism(1);
//读取文件 转换成POJO类型
URL resource = HotPages.class.getResource("/apache.log");
DataStream<String> inputStream = env.readTextFile(resource.getPath());
DataStream<ApachLogEvent> dataStream = inputStream
.map(line -> {
//分隔文件内容
String[] fields = line.split(" ");
//格式化时间格式
SimpleDateFormat simpleDateFormat = new SimpleDateFormat("dd/MM/yyyy:HH:mm:ss");
Long timestamp = simpleDateFormat.parse(fields[3]).getTime();
return new ApachLogEvent(fields[0], fields[1],timestamp,fields[5],fields[6]);
})
//Watermark 设置延迟时间
.assignTimestampsAndWatermarks(new BoundedOutOfOrdernessTimestampExtractor<ApachLogEvent>(Time.seconds(60)) {
@Override
public long extractTimestamp(ApachLogEvent apachLogEvent) {
return apachLogEvent.getTimestamp();
}
});
//分组开窗聚合
//首先过滤 只要get请求即可
SingleOutputStreamOperator<PageViewCount> windowAggStream = dataStream.filter(data -> "GET".equals(data.getMethod()))
//按照url分组
//.keyBy(data ->data.getUrl())
.keyBy(ApachLogEvent::getUrl)
//.keyBy("url")
//开窗 10分钟统计一次 5秒钟滑动一次
.timeWindow(Time.minutes(10), Time.seconds(5))
.aggregate(new PageCountAgg(), new PageCountResult());
//收集同一窗口count数据,排序输出
DataStream<String> resultStream = windowAggStream.keyBy(PageViewCount::getWindowEnd)
.process(new TopNHotPages(3));
resultStream.print();
env.execute("hot pages job");
}
//自定义预聚合函数
public static class PageCountAgg implements AggregateFunction<ApachLogEvent, Long, Long>{
@Override
public Long createAccumulator() {
return 0L;
}
@Override
public Long add(ApachLogEvent apachLogEvent, Long aLong) {
return aLong+1;
}
@Override
public Long getResult(Long aLong) {
return aLong;
}
@Override
public Long merge(Long aLong, Long acc1) {
return aLong+acc1;
}
}
//实现自定义窗口函数
public static class PageCountResult implements WindowFunction<Long, PageViewCount, String, TimeWindow>{
@Override
public void apply(String s, TimeWindow timeWindow, Iterable<Long> iterable, Collector<PageViewCount> collector) throws Exception {
collector.collect( new PageViewCount(s, timeWindow.getEnd(), iterable.iterator().next()));;
}
}
//实现自定义的窗口函数
public static class TopNHotPages extends KeyedProcessFunction<Long, PageViewCount,String>{
private static Integer topSize;
public TopNHotPages(int i) {
topSize=i;
}
//定义状态 保存当前所有PageViewCount到List中
ListState<PageViewCount> pageViewCountListState;
@Override
public void processElement(PageViewCount pageViewCount, Context context, Collector<String> collector) throws Exception {
pageViewCountListState.add(pageViewCount);
//定时器+1
context.timerService().registerProcessingTimeTimer(pageViewCount.getWindowEnd()+1);
}
@Override
public void open(Configuration parameters) throws Exception {
pageViewCountListState = getRuntimeContext().getListState(new ListStateDescriptor<PageViewCount>("page-count-list",PageViewCount.class));
}
@Override
public void onTimer(long timestamp, OnTimerContext ctx, Collector<String> out) throws Exception {
ArrayList<PageViewCount> pageViewCounts = Lists.newArrayList(pageViewCountListState.get().iterator());
//排序
pageViewCounts.sort(new Comparator<PageViewCount>() {
@Override
public int compare(PageViewCount o1, PageViewCount o2) {
if(o1.getCount()>o2.getCount()){
//不用排序
return -1;
}else if(o1.getCount()<o2.getCount()){
return 1;
}else{
return 0;
}
}
});
//格式化成字符串输出
//排名信息格式化成string,方便打印输出
StringBuilder resultBuilder = new StringBuilder();
resultBuilder.append("==========");
resultBuilder.append("窗口结束时间: ").append( new Timestamp(timestamp-1)).append("\n");
//遍历列表,取topN输出
for(int i = 0;i<Math.min(topSize,pageViewCounts.size());i++){
PageViewCount currentItemViewCount = pageViewCounts.get(i);
resultBuilder.append("NO ").append(i+1).append(":")
.append(" url=").append(currentItemViewCount.getUrl())
.append("热门度: ").append(currentItemViewCount.getCount())
.append("\n");
}
resultBuilder.append("==========\n");
Thread.sleep(1000L);
out.collect(resultBuilder.toString());
}
}
}
4、结果展示