方法一:在keyBy之前开窗(使用ProcessAllWindowFunction)
/**
* 需求:求TopN
* 实时统计一段时间内的热门url。例如:需要统计最近10s内最热门的两个url链接,并且每5s更新一次。
* 分析:我们知道,这可以用一个滑动窗口来实现,而热门度一般可以直接用访问量来表示,于是就需要开滑动窗口
* 收集url的访问数据,按照不同的url进行统计,而后汇总排序并最终输出前两名。
*
* 方式一:在keyBy之前开窗,把不同url放到同一窗口
* 方式二:keyBy一个常量,把不同url放到同一窗口,然后开窗并求各自url访问量,再排序求出访问量前两名的url
*/
import com.sandra.bean.Event;
import com.sandra.day03.Flink01_Source_Customer;
import org.apache.flink.api.common.eventtime.SerializableTimestampAssigner;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.*;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.windowing.ProcessAllWindowFunction;
import org.apache.flink.streaming.api.windowing.assigners.SlidingEventTimeWindows;
import org.apache.flink.streaming.api.windowing.assigners.SlidingProcessingTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
import org.apache.flink.util.Collector;
import java.time.Duration;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
/**
* 需求:求TopN
* 实时统计一段时间内的热门url。例如:需要统计最近10s内最热门的两个url链接,并且每5s更新一次。
* 分析:我们知道,这可以用一个滑动窗口来实现,而热门度一般可以直接用访问量来表示,于是就需要开滑动窗口
* 收集url的访问数据,按照不同的url进行统计,而后汇总排序并最终输出前两名。
*
* 方式一:在keyBy之前开窗,把不同url放到同一窗口
* 方式二:keyBy一个常量,把不同url放到同一窗口,然后开窗并求各自url访问量,再排序求出访问量前两名的url
*/
public class Flink07_TopN {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);
        // Custom source that randomly generates Event records.
        SingleOutputStreamOperator<Event> streamSource = env.addSource(new Flink01_Source_Customer.MySourceFunction())
                // Assign event-time timestamps and generate watermarks.
                .assignTimestampsAndWatermarks(WatermarkStrategy
                        // Tolerate up to 3s of out-of-order events.
                        .<Event>forBoundedOutOfOrderness(Duration.ofSeconds(3))
                        // Event-time timestamp, in milliseconds.
                        .withTimestampAssigner(new SerializableTimestampAssigner<Event>() {
                            @Override
                            public long extractTimestamp(Event element, long recordTimestamp) {
                                return element.timestamp; // ms
                            }
                        })
                );
        // Keep only the url field.
        SingleOutputStreamOperator<String> urlStream = streamSource.map(new MapFunction<Event, String>() {
            @Override
            public String map(Event value) throws Exception {
                return value.url;
            }
        });
        // Hottest 2 urls of the last 10s, refreshed every 5s (window size 10s, slide 5s).
        // todo: open the window BEFORE keyBy so different urls land in the same window
        // and can be compared against each other.
        urlStream.windowAll(SlidingEventTimeWindows.of(Time.seconds(10), Time.seconds(5)))
                .process(new TopNProcessAllWindowFun(2))
                .print();
        env.execute();
    }

    /**
     * All-window function that counts occurrences of each url within the window
     * and emits up to topN (url, count) pairs in descending count order.
     */
    public static class TopNProcessAllWindowFun extends ProcessAllWindowFunction<String, Tuple2<String, Integer>, TimeWindow> {
        private Integer topn;

        public TopNProcessAllWindowFun(Integer topN) {
            this.topn = topN;
        }

        @Override
        public void process(Context context, Iterable<String> elements, Collector<Tuple2<String, Integer>> out) throws Exception {
            // One fresh count map per window firing.
            HashMap<String, Integer> hashMap = new HashMap<>();
            for (String url : elements) {
                // merge() replaces the containsKey/get/put dance.
                hashMap.merge(url, 1, Integer::sum);
            }
            // Copy the counts into a List so they can be sorted.
            ArrayList<Tuple2<String, Integer>> url2CountList = new ArrayList<>();
            for (String url : hashMap.keySet()) {
                url2CountList.add(Tuple2.of(url, hashMap.get(url)));
            }
            // Sort by count, descending. Integer.compare avoids the overflow risk
            // of the "o2.f1 - o1.f1" subtraction idiom.
            url2CountList.sort(new Comparator<Tuple2<String, Integer>>() {
                @Override
                public int compare(Tuple2<String, Integer> o1, Tuple2<String, Integer> o2) {
                    return Integer.compare(o2.f1, o1.f1);
                }
            });
            // Emit up to topN urls. Bound by list size: a window may contain fewer
            // than topN distinct urls, and the original unconditional get(i) would
            // throw IndexOutOfBoundsException in that case.
            int limit = Math.min(topn, url2CountList.size());
            for (int i = 0; i < limit; i++) {
                out.collect(url2CountList.get(i));
            }
        }
    }
}
自定义source:
package com.sandra.day03;
import com.sandra.bean.Event;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.source.ParallelSourceFunction;
import java.util.Random;
//自定义Source
public class Flink01_Source_Customer {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);
        // Read data from the custom source.
        DataStreamSource<Event> streamSource = env.addSource(new MySourceFunction()).setParallelism(2);
        streamSource.print();
        env.execute();
    }

    // Implementing SourceFunction limits the source to parallelism 1:
    // public static class MySourceFunction implements SourceFunction<Event>{
    // To support parallelism > 1, implement ParallelSourceFunction instead.
    public static class MySourceFunction implements ParallelSourceFunction<Event> {
        // private Random random = new Random(); // declared here, parallel subtasks may emit identical sequences per time slice
        // volatile is required: cancel() is invoked from a different thread than
        // run(), and without it the Java memory model does not guarantee the
        // while loop ever observes the flag change.
        private volatile boolean isRunning = true;

        /**
         * Generates data (or could fetch it from an external system); typically
         * contains a while loop driven by the cancellation flag.
         *
         * @param ctx context used to emit generated records downstream via collect()
         * @throws Exception if emission fails
         */
        @Override
        public void run(SourceContext<Event> ctx) throws Exception {
            Random random = new Random(); // local instance: each parallel subtask draws its own sequence
            String[] users = {"sandra","wuli"};
            String[] urls = {"./home","./cart","./flink","./java"};
            while (isRunning){
                ctx.collect(new Event(users[random.nextInt(users.length)],urls[random.nextInt(urls.length)],System.currentTimeMillis()));
                // Thread.sleep(500);
            }
        }

        /**
         * Terminates the while loop in run(). Called by the framework, not by
         * user code.
         */
        @Override
        public void cancel() {
            isRunning = false;
        }
    }
}
方法二:keyBy一个常量再开窗(使用KeyedProcessFunction)
import com.sandra.bean.Event;
import com.sandra.day03.Flink01_Source_Customer;
import org.apache.flink.api.common.eventtime.SerializableTimestampAssigner;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.KeyedStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.datastream.WindowedStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.windowing.ProcessAllWindowFunction;
import org.apache.flink.streaming.api.functions.windowing.ProcessWindowFunction;
import org.apache.flink.streaming.api.windowing.assigners.SlidingEventTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
import org.apache.flink.util.Collector;
import org.apache.kafka.common.protocol.types.Field;
import java.time.Duration;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
/**
* 需求:求TopN
* 实时统计一段时间内的热门url。例如:需要统计最近10s内最热门的两个url链接,并且每5s更新一次。
* 分析:我们知道,这可以用一个滑动窗口来实现,而热门度一般可以直接用访问量来表示,于是就需要开滑动窗口
* 收集url的访问数据,按照不同的url进行统计,而后汇总排序并最终输出前两名。
*
* 方式一:在keyBy之前开窗,把不同url放到同一窗口
* 方式二:keyBy一个常量,把不同url放到同一窗口,然后开窗并求各自url访问量,再排序求出访问量前两名的url
*/
public class Flink07_TopN2 {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);
        // Custom source that randomly generates Event records.
        SingleOutputStreamOperator<Event> streamSource = env.addSource(new Flink01_Source_Customer.MySourceFunction())
                // Assign event-time timestamps and generate watermarks.
                .assignTimestampsAndWatermarks(WatermarkStrategy
                        // Tolerate up to 3s of out-of-order events.
                        .<Event>forBoundedOutOfOrderness(Duration.ofSeconds(3))
                        // Event-time timestamp, in milliseconds.
                        .withTimestampAssigner(new SerializableTimestampAssigner<Event>() {
                            @Override
                            public long extractTimestamp(Event element, long recordTimestamp) {
                                return element.timestamp; // ms
                            }
                        })
                );
        // Keep only the url field.
        SingleOutputStreamOperator<String> urlStream = streamSource.map(new MapFunction<Event, String>() {
            @Override
            public String map(Event value) throws Exception {
                return value.url;
            }
        });
        // todo: keyBy a constant so every url lands in the same keyed window.
        KeyedStream<String, String> keyedStream = urlStream.keyBy(new KeySelector<String, String>() {
            @Override
            public String getKey(String value) throws Exception {
                return "1";
            }
        });
        WindowedStream<String, String, TimeWindow> windowedStream
                = keyedStream.window(SlidingEventTimeWindows.of(Time.seconds(10), Time.seconds(5)));
        windowedStream.process(new ProcessWindowFunction<String, Tuple2<String, Integer>, String, TimeWindow>() {
            @Override
            public void process(String s, Context context, Iterable<String> elements, Collector<Tuple2<String, Integer>> out) throws Exception {
                HashMap<String, Integer> hashMap = new HashMap<>(); // (url, page views)
                ArrayList<Tuple2<String, Integer>> list = new ArrayList<>();
                // Count page views per url.
                for (String url : elements) {
                    // merge() replaces the containsKey/get/put dance.
                    hashMap.merge(url, 1, Integer::sum);
                }
                // Copy the counts into a List so they can be sorted.
                for (String url : hashMap.keySet()) {
                    list.add(Tuple2.of(url, hashMap.get(url)));
                }
                // Sort by page views, descending. Integer.compare avoids the
                // overflow risk of the "o2.f1 - o1.f1" subtraction idiom.
                list.sort(new Comparator<Tuple2<String, Integer>>() {
                    @Override
                    public int compare(Tuple2<String, Integer> o1, Tuple2<String, Integer> o2) {
                        return Integer.compare(o2.f1, o1.f1);
                    }
                });
                // Emit the top 2. Bound by list size: a window may hold fewer than
                // 2 distinct urls, and an unconditional get(i) would then throw
                // IndexOutOfBoundsException.
                int limit = Math.min(2, list.size());
                for (int i = 0; i < limit; i++) {
                    out.collect(list.get(i));
                }
            }
        }).print();
        env.execute();
    }}
方法三:(最优解)keyBy url,开窗求各自url浏览量(该代码还有一点瑕疵:求top2的过程中,如果10s内只有一个url或者没有url访问,list.get(i)就会抛出IndexOutOfBoundsException(索引越界异常,而非空指针异常),可自行解决)
public class Flink07_TopN4 {
public static void main(String[] args) {
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(3);
DataStreamSource<Event> streamSource = env.addSource(new Flink01_Source_Customer.MySourceFunction());
//将相同url分到一个组里
KeyedStream<String, String> urlKeyedStream = streamSource.assignTimestampsAndWatermarks(WatermarkStrategy
.<Event>forBoundedOutOfOrderness(Duration.ofSeconds(0))
.withTimestampAssigner(new SerializableTimestampAssigner<Event>() {
@Override
public long extractTimestamp(Event element, long recordTimestamp) {
return element.timestamp;
}
})).map(new MapFunction<Event, String>() {
@Override
public String map(Event value) throws Exception {
return value.url;
}
}).keyBy(new KeySelector<String, String>() {
@Override
public String getKey(String url) throws Exception {
return url;
}
});
//对相同url进行开窗求浏览量
SingleOutputStreamOperator<Tuple4<String, Integer, Long, Long>> urlPvStream = urlKeyedStream.window(SlidingEventTimeWindows.of(Time.seconds(10), Time.seconds(5)))
//用增量聚合函数提前进行聚合,减少state的存储压力,并结合全窗口函数将数据包装成tuple4<url,浏览量,窗口开始时间,窗口结束时间>
.aggregate(new AggregateFunction<String, Tuple2<String, Integer>, Tuple2<String, Integer>>() {
@Override
public Tuple2<String, Integer> createAccumulator() {
return Tuple2.of("", 0);
}
@Override
public Tuple2<String, Integer> add(String url, Tuple2<String, Integer> accumulator) {
return Tuple2.of(url, accumulator.f1 + 1);
}
@Override
public Tuple2<String, Integer> getResult(Tuple2<String, Integer> accumulator) {
return accumulator;
}
@Override
public Tuple2<String, Integer> merge(Tuple2<String, Integer> a, Tuple2<String, Integer> b) {
return null;
}
}, new ProcessWindowFunction<Tuple2<String, Integer>, Tuple4<String, Integer, Long, Long>, String, TimeWindow>() {
@Override
public void process(String s, Context context, Iterable<Tuple2<String, Integer>> elements, Collector<Tuple4<String, Integer, Long, Long>> out) throws Exception {
Tuple2<String, Integer> next = elements.iterator().next();//迭代器里只有一条数据
out.collect(Tuple4.of(next.f0, next.f1, context.window().getStart(), context.window().getEnd()));
}
});
//将相同窗口的不同url及其携带的信息(浏览量,窗口起止时间)聚合到一起
KeyedStream<Tuple4<String, Integer, Long, Long>, Long> windowStream = urlPvStream.keyBy(new KeySelector<Tuple4<String, Integer, Long, Long>, Long>() {
@Override
public Long getKey(Tuple4<String, Integer, Long, Long> value) throws Exception {
return value.f3;//窗口开始时间或结束时间都可以
}
});
//将相同窗口的不同url的浏览量进行从大到小排序,取出前两名
windowStream.process(new KeyedProcessFunction<Long, Tuple4<String, Integer, Long, Long>, String>() {
private ListState<Tuple4> listState;
@Override
public void open(Configuration parameters) throws Exception {
listState = getRuntimeContext().getListState(new ListStateDescriptor<Tuple4>("listState", Types.TUPLE(Types.STRING, Types.INT, Types.LONG, Types.LONG)));
}
@Override
public void processElement(Tuple4<String, Integer, Long, Long> value, Context ctx, Collector<String> out) throws Exception {
//来一条数据收集一条,直到定时器时间到了对该窗口的所有数据按照浏览量进行排序
listState.add(value);
//注册定时器,确保定时器到了该窗口所有数据都收集齐了
ctx.timerService().registerEventTimeTimer(value.f3+1);//窗口的结束时间+1ms,这里的1ms表示等待所有分区数据到最后一个算子任务的总时间
}
//定时器触发时对窗口里的数据排序求topN
@Override
public void onTimer(long timestamp, OnTimerContext ctx, Collector<String> out) throws Exception {
//listState是没有sort方法的,要把状态集合里面的数据都取出来放到list集合里,再调用sort方法
ArrayList<Tuple4> list = new ArrayList<>();
Iterable<Tuple4> tuple4s = listState.get();
for (Tuple4 tuple4 : tuple4s) {
list.add(tuple4);
}
//清除状态释放资源
listState.clear();
//按照浏览量从大到小排序
list.sort(new Comparator<Tuple4>() {
@Override
public int compare(Tuple4 o1, Tuple4 o2) {
return (int)o2.f1-(int)o1.f1;
}
});
//取出top2方式一(
该代码还有一点瑕疵:求top2的过程中,如果10s只有一个url或者没有url访问,get(i)就会出现空指针异常)
// for (int i = 0; i < 2; i++) {
// out.collect("浏览量第" + (i+1) + "的url为" + list.get(i).f0 +",访问次数为"+list.get(i).f1+",所属窗口为:["+list.get(i).f2+","+list.get(i).f3+")");
//
// }
//取出top2方式二(
该代码还有一点瑕疵:求top2的过程中,如果10s只有一个url或者没有url访问,get(i)就会出现空指针异常)
StringBuilder stringBuilder = new StringBuilder();
stringBuilder.append("======================\n");
Long windowStart = 0L;
Long windowEnd = 0L;
for (int i = 0; i < 2; i++) {
Tuple4 tuple4 = list.get(i);
windowStart=(Long) tuple4.f2;
windowEnd=(Long) tuple4.f3;
stringBuilder.append(" 浏览量第" + (i+1) + "的url为"+tuple4.f0)
.append("访问次数为:" + tuple4.f1)
;
}
stringBuilder .append("所属窗口为:[" +windowStart + "," +windowEnd+ ")")
.append("\n");
out.collect(stringBuilder.toString());
}
}).print();
try {
env.execute();
} catch (Exception e) {
e.printStackTrace();
}
}
}
方法四:自己造窗口(自己算窗口开始时间和定时器触发时间)需求改动一下:求每5s热门top2访问的url
import com.sandra.bean.Event;
import org.apache.flink.api.common.eventtime.SerializableTimestampAssigner;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.common.state.MapState;
import org.apache.flink.api.common.state.MapStateDescriptor;
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.KeyedProcessFunction;
import org.apache.flink.util.Collector;
import java.time.Duration;
/**
* 需求:模拟一个滚动窗口,我们要计算的是每一个url在每一个窗口中的pv数据,我们之前在day05:Flink05_TimeWindow_TumblingWindow_Aggre_Process
* 使用增量聚合和全窗口结合的方式实现过这个需求,这里用MapState来实现一下
*/
public class Flink11_KeyedState_MapState {
    public static void main(String[] args) {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);
        // Source with event-time timestamps and zero-lateness watermarks.
        SingleOutputStreamOperator<Event> eventStream = env
                .addSource(new Flink02_Source_Customer_WithWaterMark1.MySourceFunction())
                .assignTimestampsAndWatermarks(WatermarkStrategy
                        .<Event>forBoundedOutOfOrderness(Duration.ofSeconds(0))
                        .withTimestampAssigner(new SerializableTimestampAssigner<Event>() {
                            @Override
                            public long extractTimestamp(Event element, long recordTimestamp) {
                                return element.timestamp;
                            }
                        })
                );
        eventStream
                .keyBy(new KeySelector<Event, String>() {
                    @Override
                    public String getKey(Event value) throws Exception {
                        return value.url;
                    }
                })
                /*
                 * Hand-rolled 5s tumbling window computing each url's page views:
                 * 1. every record's window start is derived from its own timestamp
                 *    (same formula the built-in window assigners use), so records of
                 *    the same window always compute the same start time;
                 * 2. the stream is keyed by url, so counts are tracked per url;
                 * 3. per-key MapState holds (windowStart -> running count), letting
                 *    each arriving record fold into the previous result;
                 * 4. an event-time timer at windowEnd - 1 fires the window and
                 *    cleans up its entry.
                 */
                .process(new MyFakeWindow(5000)).print(); // 5s tumbling window
        try {
            env.execute();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /** Emulates a tumbling event-time window with keyed MapState plus timers. */
    private static class MyFakeWindow extends KeyedProcessFunction<String, Event, String> {
        // (windowStart -> page-view count of this key's url within that window)
        private MapState<Long, Integer> mapState;
        private Integer windowSize;

        public MyFakeWindow(Integer windowSize) {
            this.windowSize = windowSize;
        }

        @Override
        public void open(Configuration parameters) throws Exception {
            mapState = getRuntimeContext().getMapState(
                    new MapStateDescriptor<Long, Integer>("map-state", Long.class, Integer.class));
        }

        @Override
        public void processElement(Event value, Context ctx, Collector<String> out) throws Exception {
            // Determine which window this record belongs to, using the standard
            // assigner formula with offset 0 (equivalent to ts - ts % windowSize
            // for non-negative timestamps).
            long start = value.timestamp - (value.timestamp - 0 + windowSize) % windowSize;
            long end = start + windowSize;
            // Incremental aggregation: fold each record into the running count.
            Integer current = mapState.get(start);
            mapState.put(start, current == null ? 1 : current + 1);
            // The timer controls firing and closing: for [0, 5000) it fires at 4999ms.
            ctx.timerService().registerEventTimeTimer(end - 1);
        }

        // Timer firing: emit the window's result, then drop its state.
        @Override
        public void onTimer(long timestamp, OnTimerContext ctx, Collector<String> out) throws Exception {
            long windowStart = timestamp + 1 - windowSize; // timestamp is windowEnd - 1
            long windowEnd = windowStart + windowSize;
            out.collect("窗口:[" + windowStart + "," + windowEnd + ") ===>pv:" + mapState.get(windowStart));
            // Firing also closes the window: release its entry.
            mapState.remove(windowStart);
        }
    }
}