Flink 流处理高阶编程实战案例
基于埋点日志数据的网络流量统计
指定时间范围内网站总浏览量(PV)的统计
每个小时的pv统计
package com.yire.practice.highlevel;
import com.yire.bean.UserBehavior;
import org.apache.flink.api.common.eventtime.SerializableTimestampAssigner;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.common.functions.AggregateFunction;
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.util.Collector;
public class Flink_High_Level_PV {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(2);

        DataStreamSource<String> source = env.readTextFile("input/UserBehavior.csv");

        // CSV timestamps are seconds and monotonically increasing; Flink expects millis.
        WatermarkStrategy<UserBehavior> strategy = WatermarkStrategy
                .<UserBehavior>forMonotonousTimestamps()
                .withTimestampAssigner(new SerializableTimestampAssigner<UserBehavior>() {
                    @Override
                    public long extractTimestamp(UserBehavior element, long recordTimestamp) {
                        return element.getTimestamp() * 1000;
                    }
                });

        source
                // Parse one CSV line; only "pv" (page view) records flow downstream.
                .flatMap(new FlatMapFunction<String, UserBehavior>() {
                    @Override
                    public void flatMap(String value, Collector<UserBehavior> out) throws Exception {
                        String[] fields = value.split(",");
                        UserBehavior behavior = new UserBehavior(
                                Long.valueOf(fields[0]),
                                Long.valueOf(fields[1]),
                                Integer.valueOf(fields[2]),
                                fields[3],
                                Long.valueOf(fields[4]));
                        if (!"pv".equals(behavior.getBehavior())) {
                            return;
                        }
                        out.collect(behavior);
                    }
                })
                .assignTimestampsAndWatermarks(strategy)
                // Every surviving record has behavior "pv", i.e. a single logical key.
                .keyBy(UserBehavior::getBehavior)
                .window(TumblingEventTimeWindows.of(Time.hours(1)))
                // Count the page views inside each one-hour window.
                .aggregate(new AggregateFunction<UserBehavior, Long, Long>() {
                    @Override
                    public Long createAccumulator() {
                        return 0L;
                    }

                    @Override
                    public Long add(UserBehavior value, Long acc) {
                        return acc + 1;
                    }

                    @Override
                    public Long getResult(Long acc) {
                        return acc;
                    }

                    @Override
                    public Long merge(Long left, Long right) {
                        return left + right;
                    }
                })
                .print();
        env.execute();
    }
}
指定时间范围内网站独立访客数(UV)的统计
每小时内的用户数量
package com.yire.practice.highlevel;
import com.google.common.collect.Iterators;
import com.yire.bean.UserBehavior;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.common.state.MapState;
import org.apache.flink.api.common.state.MapStateDescriptor;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.windowing.ProcessWindowFunction;
import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
import org.apache.flink.util.Collector;
public class Flink_High_Level_UV {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(2);
        DataStreamSource<String> data = env.readTextFile("input/UserBehavior.csv");
        // CSV timestamps are seconds and monotonically increasing; Flink expects millis.
        WatermarkStrategy<UserBehavior> wms = WatermarkStrategy
                .<UserBehavior>forMonotonousTimestamps()
                .withTimestampAssigner((element, recordTimestamp) -> element.getTimestamp() * 1000);
        data
                // Parse one CSV line into a UserBehavior; keep only "pv" records.
                .flatMap(new FlatMapFunction<String, UserBehavior>() {
                    @Override
                    public void flatMap(String value, Collector<UserBehavior> out) throws Exception {
                        String[] line = value.split(",");
                        UserBehavior userBehavior = new UserBehavior(
                                Long.valueOf(line[0]),
                                Long.valueOf(line[1]),
                                Integer.valueOf(line[2]),
                                line[3],
                                Long.valueOf(line[4]));
                        if ("pv".equals(userBehavior.getBehavior())) {
                            out.collect(userBehavior);
                        }
                    }
                })
                .assignTimestampsAndWatermarks(wms)
                .keyBy(UserBehavior::getBehavior)
                .window(TumblingEventTimeWindows.of(Time.hours(1)))
                .process(new ProcessWindowFunction<UserBehavior, String, String, TimeWindow>() {
                    // MapState used as a set of userIds for de-duplication. This is
                    // keyed state, scoped to the KEY — not to the window.
                    private MapState<String, String> state;

                    @Override
                    public void open(Configuration parameters) throws Exception {
                        state = getRuntimeContext().getMapState(new MapStateDescriptor<String, String>("state", String.class, String.class));
                    }

                    @Override
                    public void process(String key, Context context, Iterable<UserBehavior> elements, Collector<String> out) throws Exception {
                        for (UserBehavior element : elements) {
                            state.put(element.getUserId().toString(), "");
                        }
                        int size = Iterators.size(state.keys().iterator());
                        // FIX: keyed state survives across windows. Without this clear(),
                        // users seen in earlier hours would be counted again in every
                        // later window and the state would grow without bound.
                        state.clear();
                        out.collect(size + "");
                    }
                })
                .print();
        env.execute();
    }
}
电商数据分析
实时热门商品统计
每隔5分钟输出最近1小时内点击量最多的前N个商品
package com.yire.practice.highlevel;
import com.yire.bean.HotItem;
import com.yire.bean.UserBehavior;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.common.functions.AggregateFunction;
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.common.state.ListState;
import org.apache.flink.api.common.state.ListStateDescriptor;
import org.apache.flink.api.common.state.ValueState;
import org.apache.flink.api.common.state.ValueStateDescriptor;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.KeyedProcessFunction;
import org.apache.flink.streaming.api.functions.windowing.ProcessWindowFunction;
import org.apache.flink.streaming.api.windowing.assigners.SlidingEventTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
import org.apache.flink.util.Collector;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
// Real-time hot items: every 5 minutes, emit the 3 most-clicked items of the
// last hour. Stage 1 counts clicks per item in a sliding window; stage 2 ranks
// the per-item counts that share the same window end.
public class Flink_High_Level_TopN {
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(5);
DataStreamSource<String> data = env.readTextFile("input/UserBehavior.csv");
// CSV timestamps are seconds (assumed monotonically increasing); Flink wants millis.
WatermarkStrategy<UserBehavior> wms = WatermarkStrategy
.<UserBehavior>forMonotonousTimestamps()
.withTimestampAssigner((element, recordTimestamp) -> element.getTimestamp() * 1000);
data
// Parse a CSV line into a UserBehavior; keep only "pv" (page-view) records.
.flatMap(new FlatMapFunction<String, UserBehavior>() {
@Override
public void flatMap(String value, Collector<UserBehavior> out) throws Exception {
String[] line = value.split(",");
UserBehavior userBehavior = new UserBehavior(
Long.valueOf(line[0]),
Long.valueOf(line[1]),
Integer.valueOf(line[2]),
line[3],
Long.valueOf(line[4]));
if ("pv".equals(userBehavior.getBehavior())) {
out.collect(userBehavior);
}
}
})
.assignTimestampsAndWatermarks(wms)
.keyBy(UserBehavior::getItemId)
// One-hour window, sliding every five minutes.
.window(SlidingEventTimeWindows.of(Time.hours(1), Time.minutes(5)))
// Incremental click count per item; the accumulator is just a Long.
.aggregate(new AggregateFunction<UserBehavior, Long, Long>() {
@Override
public Long createAccumulator() {
return 0L;
}
@Override
public Long add(UserBehavior value, Long accumulator) {
return accumulator + 1L;
}
@Override
public Long getResult(Long accumulator) {
return accumulator;
}
@Override
public Long merge(Long a, Long b) {
return a + b;
}
// Wraps the final count with the item id and the window end time.
}, new ProcessWindowFunction<Long, HotItem, Long, TimeWindow>() {
@Override
public void process(Long key, Context context, Iterable<Long> elements, Collector<HotItem> out) throws Exception {
// elements contains exactly one value: the pre-aggregated count.
out.collect(new HotItem(key, elements.iterator().next(), context.window().getEnd()));
}
})
// Re-key by window end so all items of the same window meet in one task.
.keyBy(HotItem::getWindowEndTime)
.process(new KeyedProcessFunction<Long, HotItem, String>() {
// Timestamp of the timer for this window (window end + 1 second).
private ValueState<Long> timeState;
// Running top list for this window, kept sorted descending by count.
private ListState<HotItem> resultState;
@Override
public void open(Configuration parameters) throws Exception {
resultState = getRuntimeContext().getListState(new ListStateDescriptor<HotItem>("ResultState", HotItem.class));
timeState = getRuntimeContext().getState(new ValueStateDescriptor<Long>("TimeState", Long.class));
}
@Override
public void processElement(HotItem value, Context ctx, Collector<String> out) throws Exception {
// Register a single timer per window, 1s past the window end
// (the current key IS the window-end timestamp).
if (timeState.value() == null) {
timeState.update(ctx.getCurrentKey() + 1000L);
ctx.timerService().registerEventTimeTimer(timeState.value());
}
resultState.add(value);
List<HotItem> result = new ArrayList<>();
for (HotItem hotItem : resultState.get()) {
result.add(hotItem);
}
result.sort((o1, o2) -> o2.getCount().compareTo(o1.getCount()));
// Keep the top 3. Only one element is added per call, so the list holds at
// most 4 entries here; dropping the last (smallest) one is sufficient.
if (result.size() > 3) {
result.remove(result.size() - 1);
}
resultState.update(result);
}
@Override
public void onTimer(long timestamp, OnTimerContext ctx, Collector<String> out) throws Exception {
Iterator<HotItem> result = resultState.get().iterator();
// topN
StringBuilder sb = new StringBuilder();
// timestamp - 1000 restores the original window end time.
sb.append("窗口结束时间: ").append(timestamp - 1000).append("\n");
sb.append("---------------------------------\n");
while (result.hasNext()) {
sb.append(result.next()).append("\n");
}
sb.append("---------------------------------\n\n");
out.collect(sb.toString());
// Release per-window state once the ranking has been emitted.
resultState.clear();
timeState.clear();
}
// Parallelism 1 so the ranking is produced and printed by a single task.
}).setParallelism(1)
.print();
env.execute();
}
}
基于服务器log的热门页面浏览量统计
每隔5秒,输出最近10分钟内访问量最多的前N个URL
package com.yire.practice.highlevel;
import com.yire.bean.ApacheLog;
import com.yire.bean.PageCount;
import com.yire.bean.PageCount;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.common.functions.AggregateFunction;
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.common.state.ListState;
import org.apache.flink.api.common.state.ListStateDescriptor;
import org.apache.flink.api.common.state.ValueState;
import org.apache.flink.api.common.state.ValueStateDescriptor;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.KeyedProcessFunction;
import org.apache.flink.streaming.api.functions.windowing.ProcessWindowFunction;
import org.apache.flink.streaming.api.windowing.assigners.SlidingEventTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
import org.apache.flink.util.Collector;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
public class Flink_High_Level_PageTopN {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(5);
        DataStreamSource<String> data = env.readTextFile("input/apache.log");
        // Event times are already milliseconds after SimpleDateFormat parsing.
        WatermarkStrategy<ApacheLog> wms = WatermarkStrategy
                .<ApacheLog>forMonotonousTimestamps()
                .withTimestampAssigner((element, recordTimestamp) -> element.getEventTime());
        data
                .flatMap(new FlatMapFunction<String, ApacheLog>() {
                    // FIX: hoisted out of flatMap — the original allocated a new
                    // SimpleDateFormat for every record. One instance per operator
                    // subtask is safe because a subtask processes records
                    // single-threaded (SimpleDateFormat itself is not thread-safe).
                    private final SimpleDateFormat format = new SimpleDateFormat("dd/MM/yyyy:HH:mm:ss");

                    @Override
                    public void flatMap(String value, Collector<ApacheLog> out) throws Exception {
                        String[] line = value.split(" ");
                        ApacheLog apacheLog = new ApacheLog(
                                line[0],
                                format.parse(line[3]).getTime(),
                                line[5],
                                line[6]);
                        out.collect(apacheLog);
                    }
                })
                .assignTimestampsAndWatermarks(wms)
                .keyBy(ApacheLog::getUrl)
                // 10-minute window sliding every 5 seconds.
                .window(SlidingEventTimeWindows.of(Time.minutes(10), Time.seconds(5)))
                // Incrementally count hits per URL; the window function attaches the
                // URL and the window end to the final count.
                .aggregate(new AggregateFunction<ApacheLog, Long, Long>() {
                    @Override
                    public Long createAccumulator() {
                        return 0L;
                    }

                    @Override
                    public Long add(ApacheLog value, Long accumulator) {
                        return accumulator + 1L;
                    }

                    @Override
                    public Long getResult(Long accumulator) {
                        return accumulator;
                    }

                    @Override
                    public Long merge(Long a, Long b) {
                        return a + b;
                    }
                }, new ProcessWindowFunction<Long, PageCount, String, TimeWindow>() {
                    @Override
                    public void process(String key, Context context, Iterable<Long> elements, Collector<PageCount> out) throws Exception {
                        // elements holds exactly one value: the pre-aggregated count.
                        out.collect(new PageCount(key, elements.iterator().next(), context.window().getEnd()));
                    }
                })
                // Re-key by window end so every URL of one window lands in the same task.
                .keyBy(PageCount::getWindowEnd)
                .process(new KeyedProcessFunction<Long, PageCount, String>() {
                    private ValueState<Long> timeState;       // timer = window end + 1s
                    private ListState<PageCount> resultState; // top pages, sorted desc by count

                    @Override
                    public void open(Configuration parameters) throws Exception {
                        resultState = getRuntimeContext().getListState(new ListStateDescriptor<PageCount>("ResultState", PageCount.class));
                        timeState = getRuntimeContext().getState(new ValueStateDescriptor<Long>("TimeState", Long.class));
                    }

                    @Override
                    public void processElement(PageCount value, Context ctx, Collector<String> out) throws Exception {
                        // Register one timer per window, 1s after the window end
                        // (the current key IS the window-end timestamp).
                        if (timeState.value() == null) {
                            timeState.update(ctx.getCurrentKey() + 1000L);
                            ctx.timerService().registerEventTimeTimer(timeState.value());
                        }
                        resultState.add(value);
                        List<PageCount> result = new ArrayList<>();
                        // FIX: loop variable renamed — it used to shadow the PageCount type name.
                        for (PageCount pageCount : resultState.get()) {
                            result.add(pageCount);
                        }
                        result.sort((o1, o2) -> o2.getCount().compareTo(o1.getCount()));
                        // Keep only the top 3; at most one element is added per call, so
                        // dropping the last (smallest) entry is enough.
                        if (result.size() > 3) {
                            result.remove(result.size() - 1);
                        }
                        resultState.update(result);
                    }

                    @Override
                    public void onTimer(long timestamp, OnTimerContext ctx, Collector<String> out) throws Exception {
                        Iterator<PageCount> result = resultState.get().iterator();
                        // topN
                        StringBuilder sb = new StringBuilder();
                        // timestamp - 1000 restores the original window end.
                        sb.append("窗口结束时间: ").append(timestamp - 1000).append("\n");
                        sb.append("---------------------------------\n");
                        while (result.hasNext()) {
                            sb.append(result.next()).append("\n");
                        }
                        sb.append("---------------------------------\n\n");
                        out.collect(sb.toString());
                        // Release per-window state after emitting the ranking.
                        resultState.clear();
                        timeState.clear();
                    }
                })
                .print();
        env.execute();
    }
}
页面广告分析
页面广告点击量统计
每隔10秒输出一次最近1小时内广告点击量的前三名排行
package com.yire.practice.highlevel;
import com.yire.bean.AdsClickLog;
import com.yire.bean.AdsCount;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.common.functions.AggregateFunction;
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.common.state.ListState;
import org.apache.flink.api.common.state.ListStateDescriptor;
import org.apache.flink.api.common.state.ValueState;
import org.apache.flink.api.common.state.ValueStateDescriptor;
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.KeyedProcessFunction;
import org.apache.flink.streaming.api.functions.windowing.ProcessWindowFunction;
import org.apache.flink.streaming.api.windowing.assigners.SlidingEventTimeWindows;
import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
import org.apache.flink.util.Collector;
import java.time.Duration;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
// Ad-click TopN: every 10 seconds, emit the three most-clicked
// (province, adId) combinations of the last hour.
public class Flink_High_Level_AdTopN {
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(5);
DataStreamSource<String> data = env.readTextFile("input/AdClickLog.csv");
// Event times are seconds; tolerate up to 3s of out-of-order events.
WatermarkStrategy<AdsClickLog> wms = WatermarkStrategy
.<AdsClickLog>forBoundedOutOfOrderness(Duration.ofMillis(3000L))
.withTimestampAssigner((element, recordTimestamp) -> element.getTimestamp() * 1000);
data
// Parse one CSV line into an AdsClickLog record.
.flatMap(new FlatMapFunction<String, AdsClickLog>() {
@Override
public void flatMap(String value, Collector<AdsClickLog> out) throws Exception {
String[] line = value.split(",");
AdsClickLog adsClickLog = new AdsClickLog(Long.valueOf(line[0]),
Long.valueOf(line[1]),
line[2],
line[3],
Long.valueOf(line[4]));
out.collect(adsClickLog);
}
})
.assignTimestampsAndWatermarks(wms)
// Reduce each click to the (province, adId) pair being counted.
.map(new MapFunction<AdsClickLog, Tuple2<String, Long>>() {
@Override
public Tuple2<String, Long> map(AdsClickLog value) throws Exception {
return Tuple2.of(value.getProvince(), value.getAdId());
}
})
// The (province, adId) tuple itself is the key.
.keyBy(new KeySelector<Tuple2<String, Long>, Tuple2<String, Long>>() {
@Override
public Tuple2<String, Long> getKey(Tuple2<String, Long> value) throws Exception {
return value;
}
})
// One-hour window sliding every 10 seconds.
.window(SlidingEventTimeWindows.of(Time.hours(1), Time.seconds(10)))
// .window(TumblingEventTimeWindows.of(Time.hours(1)))
// Incremental click count per key; the accumulator is just a Long.
.aggregate(new AggregateFunction<Tuple2<String, Long>, Long, Long>() {
@Override
public Long createAccumulator() {
return 0L;
}
@Override
public Long add(Tuple2<String, Long> value, Long accumulator) {
return accumulator + 1;
}
@Override
public Long getResult(Long accumulator) {
return accumulator;
}
@Override
public Long merge(Long a, Long b) {
return a + b;
}
// Attaches province, adId and the window end time to the final count.
}, new ProcessWindowFunction<Long, AdsCount, Tuple2<String, Long>, TimeWindow>() {
@Override
public void process(Tuple2<String, Long> key, Context context, Iterable<Long> elements, Collector<AdsCount> out) throws Exception {
// elements contains exactly one value: the pre-aggregated count.
out.collect(new AdsCount(key.f0,
key.f1,
elements.iterator().next(),
context.window().getEnd()));
}
})
// Re-key by window end so all counts of a window are ranked together.
.keyBy(AdsCount::getTimeEnd)
.process(new KeyedProcessFunction<Long, AdsCount, String>() {
// Timestamp of the timer for this window (window end + 1 second).
private ValueState<Long> timeState;
// Running top list, kept sorted descending by count.
private ListState<AdsCount> resultState;
@Override
public void open(Configuration parameters) throws Exception {
resultState = getRuntimeContext().getListState(new ListStateDescriptor<AdsCount>("ResultState", AdsCount.class));
timeState = getRuntimeContext().getState(new ValueStateDescriptor<Long>("TimeState", Long.class));
}
@Override
public void processElement(AdsCount value, Context ctx, Collector<String> out) throws Exception {
// One timer per window, 1s past the window end (the key IS the window end).
if (timeState.value() == null) {
timeState.update(ctx.getCurrentKey() + 1000L);
ctx.timerService().registerEventTimeTimer(timeState.value());
}
resultState.add(value);
List<AdsCount> result = new ArrayList<>();
for (AdsCount adsCount : resultState.get()) {
result.add(adsCount);
}
result.sort((o1, o2) -> o2.getCount().compareTo(o1.getCount()));
// Keep the top 3; only one element is added per call, so removing the
// last (smallest) entry is sufficient.
if (result.size() > 3) {
result.remove(result.size() - 1);
}
resultState.update(result);
}
@Override
public void onTimer(long timestamp, OnTimerContext ctx, Collector<String> out) throws Exception {
Iterator<AdsCount> result = resultState.get().iterator();
// topN
StringBuilder sb = new StringBuilder();
// timestamp - 1000 restores the original window end.
sb.append("窗口结束时间: ").append(timestamp - 1000).append("\n");
sb.append("---------------------------------\n");
while (result.hasNext()) {
sb.append(result.next()).append("\n");
}
sb.append("---------------------------------\n\n");
out.collect(sb.toString());
// Release per-window state after emitting the ranking.
resultState.clear();
timeState.clear();
}
})
.print();
env.execute();
}
}
在上面的基础上,统计每个省份的广告点击量前三名
package com.yire.practice.highlevel;
import com.yire.bean.AdsClickLog;
import com.yire.bean.AdsCount;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.common.functions.AggregateFunction;
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.common.state.MapState;
import org.apache.flink.api.common.state.MapStateDescriptor;
import org.apache.flink.api.common.state.ValueState;
import org.apache.flink.api.common.state.ValueStateDescriptor;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.KeyedProcessFunction;
import org.apache.flink.streaming.api.functions.windowing.ProcessWindowFunction;
import org.apache.flink.streaming.api.windowing.assigners.SlidingEventTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
import org.apache.flink.util.Collector;
import java.time.Duration;
import java.util.ArrayList;
import java.util.List;
// Variant of the ad-click TopN: every 10 seconds, emit the top-3 ads PER
// PROVINCE for the last hour, grouped in a per-province MapState.
public class Flink_High_Level_AdTopN02 {
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(5);
DataStreamSource<String> data = env.readTextFile("input/AdClickLog.csv");
// Event times are seconds; tolerate up to 3s of out-of-order events.
WatermarkStrategy<AdsClickLog> wms = WatermarkStrategy
.<AdsClickLog>forBoundedOutOfOrderness(Duration.ofMillis(3000L))
.withTimestampAssigner((element, recordTimestamp) -> element.getTimestamp() * 1000);
data
// Parse one CSV line into an AdsClickLog record.
.flatMap(new FlatMapFunction<String, AdsClickLog>() {
@Override
public void flatMap(String value, Collector<AdsClickLog> out) throws Exception {
String[] line = value.split(",");
AdsClickLog adsClickLog = new AdsClickLog(Long.valueOf(line[0]),
Long.valueOf(line[1]),
line[2],
line[3],
Long.valueOf(line[4]));
out.collect(adsClickLog);
}
})
.assignTimestampsAndWatermarks(wms)
// Reduce each click to the (province, adId) pair being counted.
.map(new MapFunction<AdsClickLog, Tuple2<String, Long>>() {
@Override
public Tuple2<String, Long> map(AdsClickLog value) throws Exception {
return Tuple2.of(value.getProvince(), value.getAdId());
}
})
// The (province, adId) tuple itself is the key.
.keyBy(new KeySelector<Tuple2<String, Long>, Tuple2<String, Long>>() {
@Override
public Tuple2<String, Long> getKey(Tuple2<String, Long> value) throws Exception {
return value;
}
})
// One-hour window sliding every 10 seconds.
.window(SlidingEventTimeWindows.of(Time.hours(1), Time.seconds(10)))
// .window(TumblingEventTimeWindows.of(Time.hours(1)))
// Incremental click count per key; the accumulator is just a Long.
.aggregate(new AggregateFunction<Tuple2<String, Long>, Long, Long>() {
@Override
public Long createAccumulator() {
return 0L;
}
@Override
public Long add(Tuple2<String, Long> value, Long accumulator) {
return accumulator + 1;
}
@Override
public Long getResult(Long accumulator) {
return accumulator;
}
@Override
public Long merge(Long a, Long b) {
return a + b;
}
// Attaches province, adId and the window end time to the final count.
}, new ProcessWindowFunction<Long, AdsCount, Tuple2<String, Long>, TimeWindow>() {
@Override
public void process(Tuple2<String, Long> key, Context context, Iterable<Long> elements, Collector<AdsCount> out) throws Exception {
// elements contains exactly one value: the pre-aggregated count.
out.collect(new AdsCount(key.f0,
key.f1,
elements.iterator().next(),
context.window().getEnd()));
}
})
// Re-key by window end so all counts of a window are ranked together.
.keyBy(AdsCount::getTimeEnd)
.process(new KeyedProcessFunction<Long, AdsCount, String>() {
// province -> current top-3 AdsCount list for this window.
private MapState<String, List<AdsCount>> result;
// Timestamp of the timer for this window (window end + 1 second).
private ValueState<Long> timeState;
@Override
public void open(Configuration parameters) throws Exception {
// NOTE(review): Types.POJO assumes AdsCount follows Flink's POJO rules — confirm.
result = getRuntimeContext().getMapState(new MapStateDescriptor<String, List<AdsCount>>("result", Types.STRING, Types.LIST(Types.POJO(AdsCount.class))));
timeState = getRuntimeContext().getState(new ValueStateDescriptor<Long>("TimeState", Long.class));
}
@Override
public void processElement(AdsCount value, Context ctx, Collector<String> out) throws Exception {
// One timer per window, 1s past the window end (the key IS the window end).
if (timeState.value() == null) {
timeState.update(ctx.getCurrentKey() + 1000L);
ctx.timerService().registerEventTimeTimer(timeState.value());
}
List<AdsCount> adsCounts = result.get(value.getProvince());
adsCounts = adsCounts == null ? new ArrayList<>() : adsCounts;
adsCounts.add(value);
adsCounts.sort((o1, o2) -> o2.getCount().compareTo(o1.getCount()));
// Keep the top 3 per province; only one element is added per call, so
// removing the last (smallest) entry is sufficient.
if (adsCounts.size() > 3) {
adsCounts.remove(adsCounts.size() - 1);
}
result.put(value.getProvince(), adsCounts);
}
@Override
public void onTimer(long timestamp, OnTimerContext ctx, Collector<String> out) throws Exception {
// topN
StringBuilder sb = new StringBuilder();
// timestamp - 1000 restores the original window end.
sb.append("窗口结束时间: ").append(timestamp - 1000).append("\n");
sb.append("---------------------------------\n");
// One block of output per province, each listing its top ads.
for (String key : result.keys()) {
sb.append(key).append("===\n");
List<AdsCount> adsCounts = result.get(key);
for (AdsCount adsCount : adsCounts) {
sb.append(adsCount).append("\n");
}
sb.append("===\n");
}
sb.append("---------------------------------\n\n");
out.collect(sb.toString());
// Release per-window state after emitting the ranking.
result.clear();
timeState.clear();
}
})
.print();
env.execute();
}
}
黑名单过滤
一天内同一个用户对同一个广告点击超过100次
两个功能:
- 告警: 使用侧输出流
- 已经进入黑名单的用户的广告点击记录不再进行统计
package com.yire.practice.highlevel;
import com.yire.bean.AdsClickLog;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.common.state.ValueState;
import org.apache.flink.api.common.state.ValueStateDescriptor;
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.KeyedProcessFunction;
import org.apache.flink.util.Collector;
import org.apache.flink.util.OutputTag;
import java.time.Duration;
public class Flink_High_Level_BlackList {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(5);
        DataStreamSource<String> data = env.readTextFile("input/AdClickLog.csv");
        // Event times are seconds; tolerate up to 3s of out-of-order events.
        WatermarkStrategy<AdsClickLog> wms = WatermarkStrategy
                .<AdsClickLog>forBoundedOutOfOrderness(Duration.ofMillis(3000L))
                .withTimestampAssigner((element, recordTimestamp) -> element.getTimestamp() * 1000);
        SingleOutputStreamOperator<String> result = data
                .flatMap(new FlatMapFunction<String, AdsClickLog>() {
                    @Override
                    public void flatMap(String value, Collector<AdsClickLog> out) throws Exception {
                        String[] line = value.split(",");
                        AdsClickLog adsClickLog = new AdsClickLog(Long.valueOf(line[0]),
                                Long.valueOf(line[1]),
                                line[2],
                                line[3],
                                Long.valueOf(line[4]));
                        out.collect(adsClickLog);
                    }
                })
                .assignTimestampsAndWatermarks(wms)
                .map(new MapFunction<AdsClickLog, Tuple2<Long, Long>>() {
                    @Override
                    public Tuple2<Long, Long> map(AdsClickLog value) throws Exception {
                        return Tuple2.of(value.getUserId(), value.getAdId());
                    }
                })
                // Key by (userId, adId): the 100-click limit applies per user per ad.
                .keyBy(new KeySelector<Tuple2<Long, Long>, Tuple2<Long, Long>>() {
                    @Override
                    public Tuple2<Long, Long> getKey(Tuple2<Long, Long> value) throws Exception {
                        return value;
                    }
                })
                .process(new KeyedProcessFunction<Tuple2<Long, Long>, Tuple2<Long, Long>, String>() {
                    // FIX: a single shared side-output tag instead of allocating a new
                    // anonymous OutputTag on every alert (matched by id + type).
                    private final OutputTag<Long> blackListTag = new OutputTag<Long>("blackList") {
                    };
                    private ValueState<Boolean> isFilter; // true while this key is black-listed
                    private ValueState<Long> timeEnd;     // timestamp of the un-ban timer
                    private ValueState<Long> sum;         // click count for this (user, ad)

                    @Override
                    public void open(Configuration parameters) throws Exception {
                        sum = getRuntimeContext().getState(new ValueStateDescriptor<Long>("sum", Long.class));
                        timeEnd = getRuntimeContext().getState(new ValueStateDescriptor<Long>("timeEnd", Long.class));
                        isFilter = getRuntimeContext().getState(new ValueStateDescriptor<Boolean>("isFilter", Boolean.class));
                    }

                    @Override
                    public void processElement(Tuple2<Long, Long> value, Context ctx, Collector<String> out) throws Exception {
                        // Already black-listed: drop the record without counting it.
                        if (isFilter.value() != null && isFilter.value()) {
                            return;
                        }
                        // Has the click count exceeded 100?
                        long count = (sum.value() == null ? 0L : sum.value()) + 1;
                        sum.update(count);
                        if (count > 100) {
                            // Add to the black list and raise an alert on the side output.
                            isFilter.update(true);
                            ctx.output(blackListTag, value.f0);
                            // Lift the ban 24 hours after this click.
                            // NOTE(review): the spec says "within one day" — aligning the
                            // timer to the end of the natural day may be intended; confirm.
                            long l = ctx.timestamp() + (1000 * 60 * 60 * 24);
                            timeEnd.update(l);
                            ctx.timerService().registerEventTimeTimer(l);
                        } else {
                            out.collect("用户=" + value.f0 + ",广告=" + value.f1 + ",点击量=" + count);
                        }
                    }

                    @Override
                    public void onTimer(long timestamp, OnTimerContext ctx, Collector<String> out) throws Exception {
                        // Ban expired: reset ALL state for this key.
                        // FIX: timeEnd was never cleared (leak) and isFilter was kept as
                        // a dead `false` value instead of being released.
                        isFilter.clear();
                        sum.clear();
                        timeEnd.clear();
                    }
                });
        result
                .print("正常数据");
        result
                .getSideOutput(new OutputTag<Long>("blackList") {
                })
                .print("黑名单");
        env.execute();
    }
}
恶意登录监控
package com.yire.practice.highlevel;
import com.yire.bean.LoginEvent;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.common.state.ListState;
import org.apache.flink.api.common.state.ListStateDescriptor;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.KeyedProcessFunction;
import org.apache.flink.util.Collector;
import java.util.ArrayList;
import java.util.List;
public class Flink_High_Level_Login {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(5);
        DataStreamSource<String> source = env.readTextFile("input/LoginLog.csv");
        // Event times are seconds in the log; watermarks assume monotonic order.
        WatermarkStrategy<LoginEvent> strategy = WatermarkStrategy
                // .<LoginEvent>forBoundedOutOfOrderness(Duration.ofMillis(3000L))
                .<LoginEvent>forMonotonousTimestamps()
                .withTimestampAssigner((element, recordTimestamp) -> element.getEventTime() * 1000);
        source
                // Parse one CSV line into a LoginEvent.
                .flatMap(new FlatMapFunction<String, LoginEvent>() {
                    @Override
                    public void flatMap(String value, Collector<LoginEvent> out) throws Exception {
                        String[] fields = value.split(",");
                        out.collect(new LoginEvent(
                                Long.valueOf(fields[0]),
                                fields[1],
                                fields[2],
                                Long.valueOf(fields[3])));
                    }
                })
                .assignTimestampsAndWatermarks(strategy)
                // One failure history per user.
                .keyBy(LoginEvent::getUserId)
                .process(new KeyedProcessFunction<Long, LoginEvent, String>() {
                    // Timestamps (seconds) of this user's recent failed logins.
                    private ListState<Long> failTimes;

                    @Override
                    public void open(Configuration parameters) throws Exception {
                        failTimes = getRuntimeContext().getListState(new ListStateDescriptor<Long>("state", Long.class));
                    }

                    @Override
                    public void processElement(LoginEvent value, Context ctx, Collector<String> out) throws Exception {
                        if (!"fail".equals(value.getEventType())) {
                            // A successful login resets the failure history.
                            failTimes.clear();
                            return;
                        }
                        Long now = value.getEventTime();
                        List<Long> fails = new ArrayList<>();
                        for (Long t : failTimes.get()) {
                            fails.add(t);
                        }
                        fails.add(now);
                        if (fails.size() >= 2) {
                            Long previous = fails.get(fails.size() - 2);
                            // Two consecutive failures within two seconds → suspicious login.
                            if (now - previous <= 2) {
                                out.collect("用户:" + value + ",上次登录失败时间:" + previous);
                            }
                            // Keep at most the two most recent failure times.
                            fails.remove(0);
                        }
                        failTimes.update(fails);
                    }
                })
                .print();
        env.execute();
    }
}
订单支付实时监控
package com.yire.practice.highlevel;
import com.yire.bean.OrderEvent;
import org.apache.flink.api.common.eventtime.SerializableTimestampAssigner;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.common.state.ValueState;
import org.apache.flink.api.common.state.ValueStateDescriptor;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.environment.LocalStreamEnvironment;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.KeyedProcessFunction;
import org.apache.flink.util.Collector;
import java.time.Duration;
public class Flink_High_Level_OrderEvent {
    public static void main(String[] args) throws Exception {
        LocalStreamEnvironment env = StreamExecutionEnvironment.createLocalEnvironment();
        // Event times in the CSV are seconds; tolerate 15s of out-of-orderness.
        WatermarkStrategy<OrderEvent> wms = WatermarkStrategy
                .<OrderEvent>forBoundedOutOfOrderness(Duration.ofMillis(1000 * 15))
                .withTimestampAssigner(new SerializableTimestampAssigner<OrderEvent>() {
                    @Override
                    public long extractTimestamp(OrderEvent element, long recordTimestamp) {
                        return element.getEventTime() * 1000;
                    }
                });
        env
                .readTextFile("input/OrderLog.csv")
                .map(new MapFunction<String, OrderEvent>() {
                    @Override
                    public OrderEvent map(String value) throws Exception {
                        String[] line = value.split(",");
                        return new OrderEvent(Long.valueOf(line[0]), line[1], line[2], Long.valueOf(line[3]));
                    }
                })
                .assignTimestampsAndWatermarks(wms)
                // Match the "create" and "pay" events of the same order.
                .keyBy(OrderEvent::getOrderId)
                .process(new KeyedProcessFunction<Long, OrderEvent, String>() {
                    private ValueState<Long> time;              // timestamp of the registered timeout timer
                    private ValueState<OrderEvent> payState;    // pay event still waiting for its create
                    private ValueState<OrderEvent> createState; // create event still waiting for its pay

                    @Override
                    public void open(Configuration parameters) throws Exception {
                        payState = getRuntimeContext().getState(new ValueStateDescriptor<OrderEvent>("payState", OrderEvent.class));
                        createState = getRuntimeContext().getState(new ValueStateDescriptor<OrderEvent>("createState", OrderEvent.class));
                        time = getRuntimeContext().getState(new ValueStateDescriptor<Long>("time", Long.class));
                    }

                    @Override
                    public void processElement(OrderEvent value, Context ctx, Collector<String> out) throws Exception {
                        if ("create".equals(value.getEventType())) {
                            if (payState.value() == null) {
                                // No pay seen yet: remember the create and wait.
                                createState.update(value);
                            } else {
                                // Pay arrived first (out of order): match the pair now.
                                OrderEvent pay = payState.value();
                                if (pay.getEventTime() - value.getEventTime() > 15 * 60) {
                                    out.collect("订单十五分钟超时:" + value.getOrderId());
                                } else {
                                    out.collect("订单正常" + value.getOrderId());
                                }
                                // FIX: release the matched event — it used to leak forever.
                                payState.clear();
                            }
                        } else {
                            if (createState.value() == null) {
                                // No create seen yet: remember the pay and wait.
                                payState.update(value);
                            } else {
                                OrderEvent create = createState.value();
                                if (value.getEventTime() - create.getEventTime() > 15 * 60) {
                                    out.collect("订单十五分钟超时:" + value.getOrderId());
                                } else {
                                    out.collect("订单正常" + value.getOrderId());
                                }
                                // FIX: release the matched event — it used to leak forever.
                                createState.clear();
                            }
                        }
                        // The first event of an order arms a 20-minute timeout; the second
                        // disarms it (an order has at most one create and one pay).
                        if (time.value() == null) {
                            long registerTime = (value.getEventTime() + 20 * 60) * 1000;
                            time.update(registerTime);
                            ctx.timerService().registerEventTimeTimer(registerTime);
                        } else {
                            ctx.timerService().deleteEventTimeTimer(time.value());
                            // FIX: the stored timer timestamp was never cleared.
                            time.clear();
                        }
                    }

                    @Override
                    public void onTimer(long timestamp, OnTimerContext ctx, Collector<String> out) throws Exception {
                        // The timer fires only when one half of the pair never arrived.
                        if (payState.value() != null) {
                            OrderEvent value = payState.value();
                            out.collect("该支付订单没有对应创建的订单:" + value.getOrderId());
                        }
                        if (createState.value() != null) {
                            OrderEvent value = createState.value();
                            out.collect("超时没有支付的订单:" + value.getOrderId());
                        }
                        // FIX: release all state for this order after reporting.
                        payState.clear();
                        createState.clear();
                        time.clear();
                    }
                })
                .print();
        env.execute();
    }
}