需求
计算最近10秒内访问次数最多的2个url,每5秒更新一次。
自定义数据源
import org.apache.flink.streaming.api.functions.source.SourceFunction;
import java.util.Calendar;
import java.util.Random;
/**
 * Author:panghu
 * Date:2022-04-19
 * Description: Custom data source that emits one randomly generated click {@code Event}
 * per second until it is cancelled. A source implemented through the SourceFunction
 * interface can only run with a parallelism of 1.
 */
public class ClickSource implements SourceFunction<Event> {
    // Written by cancel() and read by the loop in run(); these are invoked from different
    // threads, so the flag must be volatile for the stop request to become visible.
    private volatile boolean running = true;

    /**
     * Emits events downstream in a loop: a random user, a random url and the current
     * wall-clock time in milliseconds, then sleeps for one second.
     *
     * @param sourceContext context used to emit the generated events
     * @throws Exception if the sleeping thread is interrupted
     */
    @Override
    public void run(SourceContext<Event> sourceContext) throws Exception {
        Random random = new Random();
        String[] users = {"Mary", "Alice", "Bob", "Cary"};
        String[] urls = {
                "./home", "./cart", "./fav", "./prod?id=1", "./prod?id=2"
        };
        while (running) {
            sourceContext.collect(
                    new Event(
                            users[random.nextInt(users.length)],
                            urls[random.nextInt(urls.length)],
                            System.currentTimeMillis()
                    )
            );
            // Generate one record per second.
            Thread.sleep(1000L);
        }
    }

    /**
     * Stops the source by clearing the flag that the emit loop in {@link #run} checks.
     */
    @Override
    public void cancel() {
        running = false;
    }
}
准备POJO类
/**
* Author:panghu
* Date:2022-07-23
* Description: POJO holding one url's visit count together with the start and end of the window it was counted in
*/
import java.sql.Timestamp;
/**
 * Result record of the per-window aggregation: one url, how many times it was visited,
 * and the start/end (epoch milliseconds) of the window the count belongs to.
 *
 * <p>The fields are public and a no-arg constructor is provided, so every field may be
 * {@code null} on a freshly constructed instance.
 */
public class UrlViewCount {
    public String url;
    public Long count;
    public Long windowStart;
    public Long windowEnd;

    /** Creates an empty instance; all fields are {@code null}. */
    public UrlViewCount() {
    }

    /**
     * @param url         the visited url
     * @param count       number of visits counted in the window
     * @param windowStart window start, epoch milliseconds
     * @param windowEnd   window end, epoch milliseconds
     */
    public UrlViewCount(String url, Long count, Long windowStart, Long windowEnd) {
        this.url = url;
        this.count = count;
        this.windowStart = windowStart;
        this.windowEnd = windowEnd;
    }

    /**
     * Renders the record with the window bounds formatted as {@link Timestamp}s.
     * Unset (null) bounds are rendered as {@code null} instead of throwing.
     */
    @Override
    public String toString() {
        return "UrlViewCount{" +
                "url='" + url + '\'' +
                ", count=" + count +
                ", windowStart=" + formatMillis(windowStart) +
                ", windowEnd=" + formatMillis(windowEnd) +
                '}';
    }

    // Formats epoch milliseconds as a Timestamp string; a null value yields "null" so that
    // toString() never fails on an instance created through the no-arg constructor.
    private static String formatMillis(Long epochMillis) {
        return epochMillis == null ? "null" : new Timestamp(epochMillis).toString();
    }
}
TopN实现
import com.wanshun.bigdata.chapter05.ClickSource;
import com.wanshun.bigdata.chapter05.Event;
import com.wanshun.bigdata.java.UrlViewCount;
import org.apache.flink.api.common.eventtime.SerializableTimestampAssigner;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.common.functions.AggregateFunction;
import org.apache.flink.api.common.state.ListState;
import org.apache.flink.api.common.state.ListStateDescriptor;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.KeyedProcessFunction;
import org.apache.flink.streaming.api.functions.windowing.ProcessWindowFunction;
import org.apache.flink.streaming.api.windowing.assigners.SlidingEventTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
import org.apache.flink.util.Collector;
import java.sql.Timestamp;
import java.time.Duration;
import java.util.ArrayList;
import java.util.Comparator;
/**
 * Author:panghu
 * Date:2022-07-23
 * Description: Computes the top-N most visited urls per sliding window. Per-url counts come
 * from a window aggregation; they are then keyed by window end, buffered in list state and
 * ranked when an event-time timer registered in a KeyedProcessFunction fires. Timers can
 * only be registered on a keyed stream, which is why a KeyedProcessFunction is used.
 */
public class _04KeyedProcessTopN {
    /**
     * Builds and runs the job: click source -> timestamps/watermarks -> per-url sliding-window
     * counts -> top-2 per window, printed to stdout.
     *
     * @param args unused
     * @throws Exception if job execution fails
     */
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);
        DataStreamSource<Event> streamSource = env.addSource(new ClickSource());
        // Extract event timestamps and generate watermarks with zero out-of-orderness tolerance.
        SingleOutputStreamOperator<Event> streamOperator = streamSource.assignTimestampsAndWatermarks(
                WatermarkStrategy.<Event>forBoundedOutOfOrderness(Duration.ZERO)
                        .withTimestampAssigner(new SerializableTimestampAssigner<Event>() {
                            @Override
                            public long extractTimestamp(Event event, long l) {
                                return event.timeStamp;
                            }
                        })
        );
        // Print the raw events as well, so the ranking output can be checked against them.
        streamOperator.print();
        // Find the two most visited urls in every window.
        streamOperator.keyBy(data -> data.url)
                // 10-second window that slides (and therefore emits) every 5 seconds.
                .window(SlidingEventTimeWindows.of(Time.seconds(10), Time.seconds(5)))
                .aggregate(new UrlAggFunction(), new UrlProcessFunction())
                // Group the per-url counts by the end time of the window they belong to.
                .keyBy(data -> data.windowEnd)
                .process(new TopNKeyedProcessFuntion(2))
                .print();
        env.execute();
    }

    /** Incrementally counts the events of one url within a window. */
    private static class UrlAggFunction implements AggregateFunction<Event, Long, Long> {
        @Override
        public Long createAccumulator() {
            return 0L;
        }

        @Override
        public Long add(Event event, Long acc) {
            return acc + 1;
        }

        @Override
        public Long getResult(Long acc) {
            return acc;
        }

        // Merging two partial counts is their sum. The sliding windows used above are not
        // merging windows, but returning null here would violate the interface contract.
        @Override
        public Long merge(Long acc, Long acc1) {
            return acc + acc1;
        }
    }

    /** Wraps the final count of a url with the bounds of the window it was computed for. */
    private static class UrlProcessFunction extends ProcessWindowFunction<Long, UrlViewCount, String, TimeWindow> {
        @Override
        public void process(String key, Context context, Iterable<Long> elements, Collector<UrlViewCount> out) throws Exception {
            // The iterable holds the single pre-aggregated count produced by UrlAggFunction.
            Long cnt = elements.iterator().next();
            // Package it as a POJO and emit it.
            UrlViewCount urlViewCount = new UrlViewCount(key, cnt, context.window().getStart(), context.window().getEnd());
            out.collect(urlViewCount);
        }
    }

    /**
     * Buffers all per-url counts that share a window end (the key) and, once the timer for
     * that window fires, emits a formatted report of the n urls with the highest counts.
     */
    private static class TopNKeyedProcessFuntion extends KeyedProcessFunction<Long, UrlViewCount, String> {
        // How many entries to report per window.
        private final Integer n;
        // State handle; obtained in open(), so it is not part of the serialized function.
        private transient ListState<UrlViewCount> listState;

        /**
         * @param n number of top urls to report per window
         */
        public TopNKeyedProcessFuntion(Integer n) {
            this.n = n;
        }

        // open() is the lifecycle hook in which the runtime context, and with it the state
        // handle, becomes available.
        @Override
        public void open(Configuration parameters) throws Exception {
            listState = getRuntimeContext().getListState(
                    new ListStateDescriptor<UrlViewCount>("url-count", Types.POJO(UrlViewCount.class))
            );
        }

        // Invoked once for every per-url count arriving from upstream.
        @Override
        public void processElement(UrlViewCount value, Context ctx, Collector<String> out) throws Exception {
            listState.add(value);
            // The key is the window end. Firing 1 ms after it means the watermark has passed
            // the window end, i.e. all counts of that window have been received.
            ctx.timerService().registerEventTimeTimer(ctx.getCurrentKey() + 1);
        }

        @Override
        public void onTimer(long timestamp, OnTimerContext ctx, Collector<String> out) throws Exception {
            // Copy the state into a list so it can be sorted.
            ArrayList<UrlViewCount> urlViewCountArrayList = new ArrayList<>();
            for (UrlViewCount urlViewCount : listState.get()) {
                urlViewCountArrayList.add(urlViewCount);
            }
            // The window is complete; release its state.
            listState.clear();
            // Descending by count. Long.compare avoids the truncation/overflow of subtracting ints.
            urlViewCountArrayList.sort((o1, o2) -> Long.compare(o2.count, o1.count));

            StringBuilder sb = new StringBuilder();
            sb.append("=============================================\n");
            // The timer was registered at windowEnd + 1, so subtract the 1 ms again.
            sb.append("当前窗口结束时间为:" + new Timestamp(timestamp - 1) + "\n");
            sb.append("当前水位线为:" + new Timestamp(ctx.timerService().currentWatermark()) + "\n");
            // A window may contain fewer than n distinct urls; never read past the end of the list.
            int limit = Math.min(n, urlViewCountArrayList.size());
            for (int i = 0; i < limit; i++) {
                UrlViewCount entry = urlViewCountArrayList.get(i);
                sb.append("No." + (i + 1) + ": " + entry.url
                        + " count: " + entry.count
                        + "\n"
                );
            }
            sb.append("=============================================\n");
            out.collect(sb.toString());
        }
    }
}