Flink流处理高阶编程实战
目录
一、基于埋点日志数据的网络流量统计
1.1 指定时间范围内网站总浏览量(PV)的统计
实现一个网站总浏览量的统计。我们可以设置滚动时间窗口,实时统计每小时内的网站PV。此前我们已经完成了该需求的流数据操作,当前需求是在之前的基础上增加了窗口信息
/**
 * Counts page views (PV) per one-hour tumbling event-time window.
 *
 * <p>The job reads {@code input/UserBehavior.csv}, keeps only the {@code pv} records,
 * turns each of them into {@code ("pv", 1)} and sums the counter inside every window.
 */
public class P01_PV {

    /** Converts one comma-separated input line into a {@code UserBehavior}. */
    private static UserBehavior parseLine(String line) {
        String[] fields = line.split(",");
        return new UserBehavior(
                Long.valueOf(fields[0]),
                Long.valueOf(fields[1]),
                Integer.valueOf(fields[2]),
                fields[3],
                Long.valueOf(fields[4]));
    }

    public static void main(String[] args) {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(3);

        // The record timestamp is multiplied by 1000 to obtain the millisecond value used as event time;
        // events may arrive up to 5 seconds out of order.
        WatermarkStrategy<UserBehavior> watermarks = WatermarkStrategy
                .<UserBehavior>forBoundedOutOfOrderness(Duration.ofSeconds(5))
                .withTimestampAssigner((behavior, recordTimestamp) -> behavior.getTimestamp() * 1000L);

        env.readTextFile("input/UserBehavior.csv")
                .map(P01_PV::parseLine)
                // keep page-view clicks only
                .filter(behavior -> "pv".equalsIgnoreCase(behavior.getBehavior()))
                .assignTimestampsAndWatermarks(watermarks)
                // An anonymous class (not a lambda) keeps the Tuple2 type arguments visible to Flink.
                .map(new MapFunction<UserBehavior, Tuple2<String, Long>>() {
                    @Override
                    public Tuple2<String, Long> map(UserBehavior value) throws Exception {
                        return Tuple2.of("pv", 1L);
                    }
                })
                // every record carries the same key, so one counter exists per window
                .keyBy(pair -> pair.f0)
                .window(TumblingEventTimeWindows.of(Time.hours(1)))
                .sum(1)
                .print();

        try {
            env.execute();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
1.2 指定时间范围内网站独立访客数(UV)的统计
/**
 * Counts unique visitors (UV) per one-hour tumbling event-time window.
 *
 * <p>All {@code pv} records share one key, so each window sees every record; distinct user
 * ids are collected in a map state and the number of keys is emitted as the UV count.
 */
public class P02_UV {

    /** Converts one comma-separated input line into a {@code UserBehavior}. */
    private static UserBehavior parseLine(String line) {
        String[] fields = line.split(",");
        return new UserBehavior(
                Long.valueOf(fields[0]),
                Long.valueOf(fields[1]),
                Integer.valueOf(fields[2]),
                fields[3],
                Long.valueOf(fields[4]));
    }

    public static void main(String[] args) {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(3);

        env.readTextFile("input/UserBehavior.csv")
                .map(P02_UV::parseLine)
                // keep page-view clicks only
                .filter(behavior -> "pv".equalsIgnoreCase(behavior.getBehavior()))
                // event time = record timestamp * 1000, tolerating 5 seconds of disorder
                .assignTimestampsAndWatermarks(
                        WatermarkStrategy
                                .<UserBehavior>forBoundedOutOfOrderness(Duration.ofSeconds(5))
                                .withTimestampAssigner((behavior, recordTs) -> behavior.getTimestamp() * 1000L))
                // a single group, so that all users are de-duplicated together
                .keyBy(UserBehavior::getBehavior)
                .window(TumblingEventTimeWindows.of(Time.hours(1)))
                .process(new ProcessWindowFunction<UserBehavior, String, String, TimeWindow>() {

                    // NOTE(review): state obtained through getRuntimeContext() is presumably scoped
                    // to the key, not to the window; that is why it is cleared at the start of
                    // every invocation below.
                    private MapState<Long, Object> userIdState;

                    @Override
                    public void open(Configuration parameters) throws Exception {
                        MapStateDescriptor<Long, Object> descriptor =
                                new MapStateDescriptor<Long, Object>("userIdState", Long.class, Object.class);
                        userIdState = getRuntimeContext().getMapState(descriptor);
                    }

                    @Override
                    public void process(String s,
                                        Context context,
                                        Iterable<UserBehavior> elements,
                                        Collector<String> out) throws Exception {
                        // start from an empty set of user ids for this window
                        userIdState.clear();
                        for (UserBehavior behavior : elements) {
                            userIdState.put(behavior.getUserId(), new Object());
                        }

                        // the number of distinct keys is the UV of the window
                        long count = 0;
                        for (Long ignored : userIdState.keys()) {
                            count++;
                        }

                        long windowStart = context.window().getStart();
                        long windowEnd = context.window().getEnd();
                        out.collect("[" + windowStart + "," + windowEnd + "] count=" + count);
                    }
                })
                .print();

        try {
            env.execute();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
二、电商数据分析
电商平台中的用户行为频繁且较复杂,系统上线运行一段时间后,可以收集到大量的用户行为数据,进而利用大数据技术进行深入挖掘和分析,得到感兴趣的商业指标并增强对风险的控制。
电商用户行为数据多样,整体可以分为用户行为习惯数据和业务行为数据两大类。
用户的行为习惯数据包括了用户的登录方式、上线的时间点及时长、点击和浏览页面、页面停留时间以及页面跳转等等,我们可以从中进行流量统计和热门商品的统计,也可以深入挖掘用户的特征;这些数据往往可以从web服务器日志中直接读取到。
而业务行为数据就是用户在电商平台中针对每个业务(通常是某个具体商品)所做的操作,我们一般会在业务系统中相应的位置埋点,然后收集日志进行分析。
2.1 实时热门商品统计
需求分析
每隔5分钟输出最近1小时内点击量最多的前N个商品
- 最近一小时: 窗口长度
- 每隔5分钟: 窗口滑动步长
- 时间: 使用event-time
数据准备
- 这里依然采用UserBehavior.csv作为数据源,通过采集数据统计商品点击信息。
pojo类:
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
/**
 * Click count of one item inside one window.
 *
 * <p>Getters, setters, {@code equals}/{@code hashCode}/{@code toString} and both constructors
 * are generated by Lombok. The all-args constructor takes the fields in declaration order.
 */
@NoArgsConstructor
@AllArgsConstructor
@Data
public class HotItem {

    /** Id of the item that was clicked. */
    private Long itemId;

    /** Number of clicks counted for the item in the window. */
    private Long count;

    /** End timestamp of the window the count belongs to. */
    private Long windowEndTime;
}
具体实现代码
/**
 * Every 5 minutes prints the three most clicked items of the last hour (event time).
 *
 * <p>Stage 1 counts clicks per item in a sliding window and tags each count with the window
 * end. Stage 2 groups the counts by window end, buffers them, and ranks them once an
 * event-time timer set shortly after the window end fires.
 */
public class P03_HotItem {

    /** How many items are printed per window. */
    private static final int TOP_N = 3;

    /** The ranking timer is set this many milliseconds after the window end. */
    private static final long TIMER_DELAY_MS = 1000L;

    /** Incrementally counts the records of one window. */
    private static class ClickCounter implements AggregateFunction<UserBehavior, Long, Long> {
        @Override
        public Long createAccumulator() {
            return 0L;
        }

        @Override
        public Long add(UserBehavior value, Long accumulator) {
            return accumulator + 1L;
        }

        @Override
        public Long getResult(Long accumulator) {
            return accumulator;
        }

        @Override
        public Long merge(Long a, Long b) {
            return a + b;
        }
    }

    /** Wraps the pre-aggregated count together with the item id (the key) and the window end. */
    private static class WindowCountToHotItem extends ProcessWindowFunction<Long, HotItem, Long, TimeWindow> {
        @Override
        public void process(Long itemId,
                            Context context,
                            Iterable<Long> elements,
                            Collector<HotItem> out) throws Exception {
            Long clicks = elements.iterator().next();
            out.collect(new HotItem(itemId, clicks, context.window().getEnd()));
        }
    }

    /** Buffers all counts that share a window end and prints the top entries when the timer fires. */
    private static class TopItems extends KeyedProcessFunction<Long, HotItem, String> {

        /** Timestamp of the registered timer; {@code null} while no timer is registered for the key. */
        private ValueState<Long> timerTsState;

        /** Counts received so far for the current window end. */
        private ListState<HotItem> hotItemState;

        @Override
        public void open(Configuration parameters) throws Exception {
            hotItemState = getRuntimeContext()
                    .getListState(new ListStateDescriptor<HotItem>("hotItemState", HotItem.class));
            timerTsState = getRuntimeContext()
                    .getState(new ValueStateDescriptor<Long>("timerTsState", Long.class));
        }

        @Override
        public void processElement(HotItem value,
                                   Context ctx,
                                   Collector<String> out) throws Exception {
            hotItemState.add(value);

            // Only the first element of a window end registers the timer.
            if (timerTsState.value() == null) {
                long fireAt = value.getWindowEndTime() + TIMER_DELAY_MS;
                ctx.timerService().registerEventTimeTimer(fireAt);
                timerTsState.update(fireAt);
            }
        }

        @Override
        public void onTimer(long timestamp,
                            OnTimerContext ctx,
                            Collector<String> out) throws Exception {
            ArrayList<HotItem> ranking = new ArrayList<>();
            for (HotItem item : hotItemState.get()) {
                ranking.add(item);
            }
            // highest count first
            ranking.sort((left, right) -> right.getCount().compareTo(left.getCount()));

            StringBuilder report = new StringBuilder();
            report.append("窗口结束时间:").append(timestamp - TIMER_DELAY_MS).append("\n");
            int limit = Math.min(ranking.size(), TOP_N);
            for (int rank = 0; rank < limit; rank++) {
                report.append(ranking.get(rank)).append("\n");
            }
            out.collect(report.toString());

            // the window is finished; release its state
            timerTsState.clear();
            hotItemState.clear();
        }
    }

    public static void main(String[] args) {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(3);

        // event time = record timestamp * 1000, tolerating 5 seconds of disorder
        WatermarkStrategy<UserBehavior> watermarks = WatermarkStrategy
                .<UserBehavior>forBoundedOutOfOrderness(Duration.ofSeconds(5))
                .withTimestampAssigner((behavior, recordTimestamp) -> behavior.getTimestamp() * 1000L);

        env.readTextFile("input/UserBehavior.csv")
                .map(line -> {
                    String[] fields = line.split(",");
                    return new UserBehavior(
                            Long.valueOf(fields[0]),
                            Long.valueOf(fields[1]),
                            Integer.valueOf(fields[2]),
                            fields[3],
                            Long.valueOf(fields[4]));
                })
                // keep page-view clicks only
                .filter(behavior -> "pv".equalsIgnoreCase(behavior.getBehavior()))
                .assignTimestampsAndWatermarks(watermarks)
                .keyBy(UserBehavior::getItemId)
                .window(SlidingEventTimeWindows.of(Time.hours(1), Time.minutes(5)))
                .aggregate(new ClickCounter(), new WindowCountToHotItem())
                // bring all counts of the same window together
                .keyBy(HotItem::getWindowEndTime)
                .process(new TopItems())
                .print();

        try {
            env.execute();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
2.2 基于服务器log的热门页面浏览量统计
对于一个电商平台而言,用户登录的入口流量、不同页面的访问流量都是值得分析的重要数据,而这些数据,可以简单地从web服务器的日志中提取出来。
我们在这里先实现“热门页面浏览数”的统计,也就是读取服务器日志中的每一行log,统计在一段时间内用户访问每一个url的次数,然后排序输出显示。
具体做法为:每隔5秒,输出最近10分钟内访问量最多的前N个URL。可以看出,这个需求与之前“实时热门商品统计”非常类似,所以我们完全可以借鉴此前的代码。
具体实现代码
/**
 * Every 5 seconds prints the three most visited URLs of the last 10 minutes (event time),
 * based on the web-server log {@code input/apache.log}.
 *
 * <p>Stage 1 counts hits per URL in a sliding window and tags each count with the window end.
 * Stage 2 groups the counts by window end, buffers them, and ranks them once an event-time
 * timer set one second after the window end fires.
 */
public class P04_HotPags {
    public static void main(String[] args) {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(3);

        // The parsed log time is already in milliseconds; events may be up to 5 seconds out of order.
        WatermarkStrategy<ApacheLog> wms = WatermarkStrategy
                .<ApacheLog>forBoundedOutOfOrderness(Duration.ofSeconds(5))
                .withTimestampAssigner(new SerializableTimestampAssigner<ApacheLog>() {
                    @Override
                    public long extractTimestamp(ApacheLog element, long recordTimestamp) {
                        return element.getEventTime();
                    }
                });

        env.readTextFile("input/apache.log")
                .map(line -> {
                    String[] split = line.split(" ");
                    // SimpleDateFormat is not thread-safe, so a fresh instance is used per record.
                    SimpleDateFormat df = new SimpleDateFormat("dd/MM/yyyy:HH:mm:ss");
                    return new ApacheLog(
                            split[0],
                            df.parse(split[3]).getTime(),
                            split[5],
                            split[6]);
                })
                .assignTimestampsAndWatermarks(wms)
                .keyBy(ApacheLog::getUrl)
                // Requirement: "every 5 seconds, the last 10 minutes". The previous 1 h / 5 min
                // setting was carried over from the hot-item job and did not match it.
                .window(SlidingEventTimeWindows.of(Time.minutes(10), Time.seconds(5)))
                .aggregate(
                        // incremental hit counter
                        new AggregateFunction<ApacheLog, Long, Long>() {
                            @Override
                            public Long createAccumulator() {
                                return 0L;
                            }

                            @Override
                            public Long add(ApacheLog value, Long accumulator) {
                                return accumulator + 1L;
                            }

                            @Override
                            public Long getResult(Long accumulator) {
                                return accumulator;
                            }

                            @Override
                            public Long merge(Long a, Long b) {
                                return a + b;
                            }
                        },
                        // attaches the URL (the key) and the window end to the count
                        new ProcessWindowFunction<Long, PageCount, String, TimeWindow>() {
                            @Override
                            public void process(String key,
                                                Context context,
                                                Iterable<Long> elements,
                                                Collector<PageCount> out) throws Exception {
                                out.collect(new PageCount(key, elements.iterator().next(),
                                        context.window().getEnd()));
                            }
                        }
                )
                // bring all counts of the same window together
                .keyBy(PageCount::getWindowEnd)
                .process(new KeyedProcessFunction<Long, PageCount, String>() {
                    // Non-null once the timer for the current window end has been registered.
                    private ValueState<Long> timerTs;
                    // Counts received so far for the current window end.
                    private ListState<PageCount> pageState;

                    @Override
                    public void open(Configuration parameters) throws Exception {
                        pageState = getRuntimeContext()
                                .getListState(new ListStateDescriptor<PageCount>("pageState", PageCount.class));
                        timerTs = getRuntimeContext()
                                .getState(new ValueStateDescriptor<Long>("timerTs", Long.class));
                    }

                    @Override
                    public void processElement(PageCount value,
                                               Context ctx,
                                               Collector<String> out) throws Exception {
                        pageState.add(value);
                        // Only the first element of a window end registers the timer (window end + 1 s).
                        if (timerTs.value() == null) {
                            ctx.timerService().registerEventTimeTimer(value.getWindowEnd() + 1000L);
                            timerTs.update(value.getWindowEnd());
                        }
                    }

                    @Override
                    public void onTimer(long timestamp,
                                        OnTimerContext ctx,
                                        Collector<String> out) throws Exception {
                        // Sorted by count, highest first. The comparator deliberately never returns 0:
                        // a TreeSet would otherwise drop entries whose counts are equal. This breaks
                        // the Comparator contract, so the set must only be used for add/pollLast/iterate.
                        TreeSet<PageCount> pageCounts = new TreeSet<>(new Comparator<PageCount>() {
                            @Override
                            public int compare(PageCount o1, PageCount o2) {
                                if (o1.getCount() < o2.getCount()) return 1;
                                else return -1;
                            }
                        });
                        for (PageCount pageCount : pageState.get()) {
                            pageCounts.add(pageCount);
                            // keep at most three entries: drop the smallest once a fourth is added
                            if (pageCounts.size() > 3) {
                                pageCounts.pollLast();
                            }
                        }

                        StringBuilder sb = new StringBuilder();
                        sb.append("窗口结束时间:").append(timestamp - 1000).append("\n");
                        for (PageCount pageCount : pageCounts) {
                            sb.append(pageCount).append("\n");
                        }
                        out.collect(sb.toString());

                        // the window is finished; release its state
                        pageState.clear();
                        timerTs.clear();
                    }
                })
                .print();

        try {
            env.execute();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
三、页面广告分析
3.1 页面广告点击量统计
电商网站的市场营销商业指标中,除了自身的APP推广,还会考虑到页面上的广告投放(包括自己经营的产品和其它网站的广告)。所以广告相关的统计分析,也是市场营销的重要指标。
对于广告的统计,最简单也最重要的就是页面广告的点击量,网站往往需要根据广告点击量来制定定价策略和调整推广方式,而且也可以借此收集用户的偏好信息。更加具体的应用是,我们可以根据用户的地理位置进行划分,从而总结出不同省份用户对不同广告的偏好,这样更有助于广告的精准投放。
在之前的需求实现中,已经统计了广告的点击次数总和,但是没有实现窗口操作,也未增加排名处理。
这次添加窗口, 并增加排名
/**
 * Ranks ad clicks per (province, ad) pair: for a one-hour window sliding every 10 seconds,
 * prints the three pairs with the most clicks.
 *
 * <p>Stage 1 counts clicks per (province, ad) and emits
 * {@code (province, adId, count, windowEnd)}. Stage 2 groups those tuples by window end,
 * buffers them, and ranks them when an event-time timer set 10 ms after the window end fires.
 */
public class Flink05_Project_AdsClick {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(2);

        // event time = record timestamp * 1000, tolerating 20 seconds of disorder
        WatermarkStrategy<AdsClickLog> wms = WatermarkStrategy
                .<AdsClickLog>forBoundedOutOfOrderness(Duration.ofSeconds(20))
                .withTimestampAssigner(new SerializableTimestampAssigner<AdsClickLog>() {
                    @Override
                    public long extractTimestamp(AdsClickLog element, long recordTimestamp) {
                        return element.getTimestamp() * 1000L;
                    }
                });

        env
                .readTextFile("input/AdClickLog.csv")
                .map(line -> {
                    String[] datas = line.split(",");
                    return new AdsClickLog(Long.valueOf(datas[0]),
                            Long.valueOf(datas[1]),
                            datas[2],
                            datas[3],
                            Long.valueOf(datas[4]));
                })
                .assignTimestampsAndWatermarks(wms)
                // group by (province, ad)
                .keyBy(new KeySelector<AdsClickLog, Tuple2<String, Long>>() {
                    @Override
                    public Tuple2<String, Long> getKey(AdsClickLog log) throws Exception {
                        return Tuple2.of(log.getProvince(), log.getAdId());
                    }
                })
                .window(SlidingEventTimeWindows.of(Time.hours(1), Time.seconds(10)))
                .allowedLateness(Time.seconds(10))
                .sideOutputLateData(new OutputTag<AdsClickLog>("ads_late") {
                })
                .aggregate(new AggregateFunction<AdsClickLog, Long, Long>() {
                    @Override
                    public Long createAccumulator() {
                        return 0L;
                    }

                    @Override
                    public Long add(AdsClickLog value, Long accumulator) {
                        return accumulator + 1L;
                    }

                    @Override
                    public Long getResult(Long accumulator) {
                        return accumulator;
                    }

                    @Override
                    public Long merge(Long a, Long b) {
                        return a + b;
                    }
                }, new ProcessWindowFunction<Long, Tuple4<String, Long, Long, Long>, Tuple2<String, Long>, TimeWindow>() {
                    @Override
                    public void process(Tuple2<String, Long> key, Context ctx, Iterable<Long> elements, Collector<Tuple4<String, Long, Long, Long>> out) throws Exception {
                        // (province, adId, count, windowEnd)
                        out.collect(Tuple4.of(key.f0, key.f1, elements.iterator().next(), ctx.window().getEnd()));
                    }
                })
                // bring all counts of the same window together
                .keyBy(t -> t.f3)
                .process(new KeyedProcessFunction<Long, Tuple4<String, Long, Long, Long>, String>() {
                    // Non-null once the timer for the current window end has been registered.
                    private ValueState<Long> windowEnd;
                    // Tuples received so far for the current window end.
                    private ListState<Tuple4<String, Long, Long, Long>> datas;

                    @Override
                    public void open(Configuration parameters) throws Exception {
                        datas = getRuntimeContext()
                                .getListState(new ListStateDescriptor<Tuple4<String, Long, Long, Long>>("datas", TypeInformation.of(new TypeHint<Tuple4<String, Long, Long, Long>>() {
                                })));
                        windowEnd = getRuntimeContext().getState(new ValueStateDescriptor<Long>("windowEnd", Long.class));
                    }

                    @Override
                    public void processElement(Tuple4<String, Long, Long, Long> value, Context ctx, Collector<String> out) throws Exception {
                        // buffer the tuple
                        datas.add(value);
                        // the first tuple of a window end registers the timer (window end + 10 ms)
                        if (windowEnd.value() == null) {
                            ctx.timerService().registerEventTimeTimer(value.f3 + 10L);
                            windowEnd.update(value.f3);
                        }
                    }

                    @Override
                    public void onTimer(long timestamp, OnTimerContext ctx, Collector<String> out) throws Exception {
                        ArrayList<Tuple4<String, Long, Long, Long>> result = new ArrayList<>();
                        for (Tuple4<String, Long, Long, Long> t : datas.get()) {
                            result.add(t);
                        }
                        // release the state of the finished window
                        windowEnd.clear();
                        datas.clear();

                        // Sort by click count, highest first. compareTo is used instead of casting the
                        // long difference to int, which can overflow and yield a wrong order.
                        result.sort(new Comparator<Tuple4<String, Long, Long, Long>>() {
                            @Override
                            public int compare(Tuple4<String, Long, Long, Long> o1, Tuple4<String, Long, Long, Long> o2) {
                                return o2.f2.compareTo(o1.f2);
                            }
                        });

                        // build the report with at most three entries
                        StringBuilder sb = new StringBuilder();
                        sb.append("窗口结束时间: ").append(timestamp - 10).append("\n");
                        sb.append("---------------------------------\n");
                        for (int i = 0; i < Math.min(3, result.size()); i++) {
                            sb.append(result.get(i)).append("\n");
                        }
                        sb.append("---------------------------------\n\n");
                        out.collect(sb.toString());
                    }
                })
                .print();

        env.execute();
    }
}
3.2 黑名单过滤
我们进行的点击量统计,同一用户的重复点击是会叠加计算的。
在实际场景中,同一用户确实可能反复点开同一个广告,这也说明了用户对广告更大的兴趣;但是如果用户在一段时间非常频繁地点击广告,这显然不是一个正常行为,有刷点击量的嫌疑。
所以我们可以对一段时间内(比如一天内)的用户点击行为进行约束,如果对同一个广告点击超过一定限额(比如100次),应该把该用户加入黑名单并报警,此后其点击行为不应该再统计。
两个功能:
- 告警: 使用侧输出流
- 已经进入黑名单的用户的广告点击记录不再进行统计
/**
 * Blacklist filtering for ad clicks.
 *
 * <p>Clicks are counted per (user, ad). Up to 100 clicks are forwarded on the main stream;
 * afterwards a single alert is written to the {@code blackList} side output and further
 * clicks are neither counted nor forwarded. An event-time timer at the next midnight
 * (UTC+8) resets all state for the key.
 */
public class P05_BlackList {
    public static void main(String[] args) {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(3);

        // event time = record timestamp * 1000, tolerating 20 seconds of disorder
        WatermarkStrategy<AdsClickLog> watermarks = WatermarkStrategy
                .<AdsClickLog>forBoundedOutOfOrderness(Duration.ofSeconds(20))
                .withTimestampAssigner((click, recordTimestamp) -> click.getTimestamp() * 1000);

        SingleOutputStreamOperator<String> mainStream = env.readTextFile("input/AdClickLog.csv")
                .map(line -> {
                    String[] fields = line.split(",");
                    return new AdsClickLog(
                            Long.valueOf(fields[0]),
                            Long.valueOf(fields[1]),
                            fields[2],
                            fields[3],
                            Long.valueOf(fields[4]));
                })
                .assignTimestampsAndWatermarks(watermarks)
                // one counter per (user, ad)
                .keyBy(click -> click.getUserId() + "_" + click.getAdId())
                .process(new KeyedProcessFunction<String, AdsClickLog, String>() {

                    /** Set once the alert for the key has been emitted. */
                    private ValueState<Boolean> warnState;
                    /** Timestamp of the registered reset timer; {@code null} while none is registered. */
                    private ValueState<Long> timerState;
                    /** Running click count of the key. */
                    private ReducingState<Long> countState;

                    @Override
                    public void open(Configuration parameters) throws Exception {
                        countState = getRuntimeContext().getReducingState(
                                new ReducingStateDescriptor<Long>(
                                        "countState",
                                        (left, right) -> left + right,
                                        Long.class));
                        timerState = getRuntimeContext().getState(
                                new ValueStateDescriptor<Long>("timerState", Long.class));
                        warnState = getRuntimeContext().getState(
                                new ValueStateDescriptor<Boolean>("warnState", Boolean.class));
                    }

                    @Override
                    public void processElement(AdsClickLog value,
                                               Context ctx,
                                               Collector<String> out) throws Exception {
                        Long clicksSoFar = countState.get();
                        boolean limitReached = clicksSoFar != null && clicksSoFar > 99;

                        if (!limitReached) {
                            // normal path: count the click and forward the running total
                            countState.add(1L);
                            out.collect("用户:" + value.getUserId()
                                    + "广告:" + value.getAdId()
                                    + "点击量:" + countState.get());
                        } else if (warnState.value() == null) {
                            // first click above the limit: alert exactly once
                            ctx.output(new OutputTag<String>("blackList") {
                            }, "用户:" + value.getUserId()
                                    + "广告:" + value.getAdId()
                                    + "点击量:" + clicksSoFar);
                            warnState.update(true);
                        }

                        // The first element seen for a key schedules the reset at the next midnight (UTC+8).
                        if (timerState.value() == null) {
                            ZoneOffset utcPlus8 = ZoneOffset.ofHours(8);
                            long nowSeconds = ctx.timestamp() / 1000;
                            LocalDateTime today = LocalDateTime.ofEpochSecond(nowSeconds, 0, utcPlus8);
                            LocalDateTime tomorrow = LocalDateTime.of(
                                    today.toLocalDate().plusDays(1),
                                    LocalTime.of(0, 0, 0));
                            long midnightMs = tomorrow.toEpochSecond(utcPlus8) * 1000;
                            ctx.timerService().registerEventTimeTimer(midnightMs);
                            timerState.update(midnightMs);
                        }
                    }

                    @Override
                    public void onTimer(long timestamp,
                                        OnTimerContext ctx,
                                        Collector<String> out) throws Exception {
                        // a new day starts: forget count, timer marker and alert flag
                        countState.clear();
                        timerState.clear();
                        warnState.clear();
                    }
                });

        mainStream.print("main");
        mainStream.getSideOutput(new OutputTag<String>("blackList") {}).print("blackList");

        try {
            env.execute();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
四、恶意登录监控
对于网站而言,用户登录并不是频繁的业务操作。如果一个用户短时间内频繁登录失败,就有可能是出现了程序的恶意攻击,比如密码暴力破解。
因此我们考虑,应该对用户的登录失败动作进行统计,具体来说,如果同一用户(可以是不同IP)在2秒之内连续两次登录失败,就认为存在恶意登录的风险,输出相关的信息进行报警提示。这是电商网站、也是几乎所有网站风控的基本一环。
封装数据的JavaBean类
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
/**
 * One login attempt.
 *
 * <p>Getters, setters, {@code equals}/{@code hashCode}/{@code toString} and both constructors
 * are generated by Lombok. The all-args constructor takes the fields in declaration order.
 */
@AllArgsConstructor
@NoArgsConstructor
@Data
public class LoginEvent {

    /** Id of the user who tried to log in. */
    private Long userId;

    /** IP address the attempt came from. */
    private String ip;

    /** Result of the attempt; the jobs in this document compare it with "fail" and "success". */
    private String eventType;

    /** Time of the attempt; the jobs in this document store it in milliseconds. */
    private Long eventTime;
}
具体实现代码
实现逻辑:
统计连续失败的次数:
- 把失败的时间戳放入到List中,
- 当List的长度达到2的时候, 判断这两个时间戳的差是否小于等于2s
- 如果是, 则这个用户在恶意登录
- 否则不是, 然后删除List的第一个元素
- 用于保持List的长度为2
- 如果出现成功, 则需要清空List集合
/**
 * Malicious-login detection with keyed state: reports a user when two consecutive failed
 * logins are at most 2 seconds apart.
 *
 * <p>Events are handled in arrival order, so heavily out-of-order input can still hide or
 * split a pair of failures.
 */
public class P06_Login_Even {
    public static void main(String[] args) {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(3);

        // eventTime is already in milliseconds (converted while parsing)
        WatermarkStrategy<LoginEvent> wms = WatermarkStrategy
                .<LoginEvent>forBoundedOutOfOrderness(Duration.ofSeconds(20))
                .withTimestampAssigner(new SerializableTimestampAssigner<LoginEvent>() {
                    @Override
                    public long extractTimestamp(LoginEvent element, long recordTimestamp) {
                        return element.getEventTime();
                    }
                });

        env
                .readTextFile("input/LoginLog.csv")
                .map(line -> {
                    String[] data = line.split(",");
                    return new LoginEvent(Long.valueOf(data[0]),
                            data[1],
                            data[2],
                            Long.parseLong(data[3]) * 1000L);
                })
                .assignTimestampsAndWatermarks(wms)
                .keyBy(LoginEvent::getUserId)
                .process(new KeyedProcessFunction<Long, LoginEvent, String>() {
                    // Timestamps (ms) of the most recent consecutive failures; holds at most two entries.
                    private ListState<Long> state;

                    @Override
                    public void open(Configuration parameters) throws Exception {
                        state = getRuntimeContext().getListState(new ListStateDescriptor<Long>("state", Long.class));
                    }

                    @Override
                    public void processElement(LoginEvent value,
                                               Context ctx,
                                               Collector<String> out) throws Exception {
                        /*
                        Counting consecutive failures:
                            1. Put the timestamp of every failure into the list.
                            2. When the list holds 2 entries, check whether the two timestamps are
                               at most 2 s apart.
                            3. If so, the user is logging in maliciously.
                            4. In either case drop the older entry so the list never exceeds 2.
                            5. A successful login clears the list and the count starts over.
                         */
                        if ("fail".equalsIgnoreCase(value.getEventType())) {
                            // 1. remember the failure
                            state.add(value.getEventTime());
                            // copy the state into a list we can index
                            ArrayList<Long> tss = new ArrayList<>();
                            for (Long aLong : state.get()) {
                                tss.add(aLong);
                            }
                            // two failures in a row
                            if (tss.size() == 2) {
                                // Compare in milliseconds: dividing by 1000 first truncated, so gaps
                                // of up to 2.999 s were accepted. The absolute value keeps an
                                // out-of-order pair (negative gap) from always being reported.
                                long gapMs = Math.abs(tss.get(1) - tss.get(0));
                                if (gapMs <= 2000L) {
                                    out.collect(ctx.getCurrentKey() + "在恶意登陆");
                                }
                                // keep only the latest failure for the next comparison
                                tss.remove(0);
                                state.update(tss);
                            }
                        } else {
                            state.clear();
                        }
                    }
                }).print();

        try {
            env.execute();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
使用窗口
使用窗口处理乱序数据,但是会有重复数据,还需要根据窗口再次去重
/**
 * Malicious-login detection with sliding windows: inside every 5-second window (sliding by
 * 2 seconds) the failed logins of a user are sorted by event time, and each neighbouring
 * pair less than 2 seconds apart is reported.
 *
 * <p>Because the windows overlap, the same pair can be reported by more than one window.
 */
public class P06_Login_Even_Window {
    public static void main(String[] args) {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);

        // eventTime is already in milliseconds (converted while parsing)
        WatermarkStrategy<LoginEvent> wms = WatermarkStrategy
                .<LoginEvent>forBoundedOutOfOrderness(Duration.ofSeconds(20))
                .withTimestampAssigner(new SerializableTimestampAssigner<LoginEvent>() {
                    @Override
                    public long extractTimestamp(LoginEvent element, long recordTimestamp) {
                        return element.getEventTime();
                    }
                });

        env
                .readTextFile("input/LoginLog.csv")
                .map(line -> {
                    String[] data = line.split(",");
                    return new LoginEvent(Long.valueOf(data[0]),
                            data[1],
                            data[2],
                            Long.parseLong(data[3]) * 1000L);
                })
                // only failures are relevant
                .filter(log -> "fail".equalsIgnoreCase(log.getEventType()))
                .assignTimestampsAndWatermarks(wms)
                .keyBy(LoginEvent::getUserId)
                .window(SlidingEventTimeWindows.of(Time.seconds(5), Time.seconds(2)))
                .process(new ProcessWindowFunction<LoginEvent, String, Long, TimeWindow>() {
                    @Override
                    public void process(Long aLong,
                                        Context context,
                                        Iterable<LoginEvent> elements,
                                        Collector<String> out) throws Exception {
                        // copy the window content so it can be sorted
                        ArrayList<LoginEvent> loginEvents = new ArrayList<>();
                        for (LoginEvent element : elements) {
                            loginEvents.add(element);
                        }
                        // oldest failure first
                        loginEvents.sort(new Comparator<LoginEvent>() {
                            @Override
                            public int compare(LoginEvent o1, LoginEvent o2) {
                                return o1.getEventTime().compareTo(o2.getEventTime());
                            }
                        });

                        // compare every failure with its predecessor
                        for (int i = 1; i < loginEvents.size(); i++) {
                            LoginEvent x2 = loginEvents.get(i);
                            LoginEvent x1 = loginEvents.get(i - 1);
                            if (x2.getEventTime() - x1.getEventTime() < 2000) {
                                String msg = "用户:" + x1.getUserId() + " [" + x1.getEventTime()
                                        + ", " + x2.getEventTime() + "] 违规登陆";
                                out.collect(msg);
                            }
                        }
                    }
                })
                .print();

        try {
            env.execute();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
使用CEP模式匹配
package com.flink.charpter10;
import com.flink.bean.LoginEvent;
import org.apache.flink.api.common.eventtime.SerializableTimestampAssigner;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.cep.CEP;
import org.apache.flink.cep.PatternSelectFunction;
import org.apache.flink.cep.PatternStream;
import org.apache.flink.cep.pattern.Pattern;
import org.apache.flink.cep.pattern.conditions.SimpleCondition;
import org.apache.flink.streaming.api.datastream.KeyedStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.windowing.time.Time;
import java.time.Duration;
import java.util.List;
import java.util.Map;
/**
 * Malicious-login detection with Flink CEP: matches two or more consecutive failed logins
 * of one user that occur within 2 seconds, stopping at a successful login, and prints the
 * matched failures.
 */
public class T01_Evail_Login {
    public static void main(String[] args) {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(3);

        // eventTime is already in milliseconds (converted while parsing);
        // events may be up to 20 seconds out of order
        WatermarkStrategy<LoginEvent> watermarks = WatermarkStrategy
                .<LoginEvent>forBoundedOutOfOrderness(Duration.ofSeconds(20))
                .withTimestampAssigner((event, recordTimestamp) -> event.getEventTime());

        KeyedStream<LoginEvent, Long> loginsByUser = env
                .readTextFile("input/LoginLog.csv")
                .map(line -> {
                    String[] fields = line.split(",");
                    return new LoginEvent(
                            Long.valueOf(fields[0]),
                            fields[1],
                            fields[2],
                            Long.parseLong(fields[3]) * 1000L);
                })
                .assignTimestampsAndWatermarks(watermarks)
                .keyBy(LoginEvent::getUserId);

        // 1. Describe the pattern: >= 2 consecutive "fail" events, until a "success", within 2 s.
        Pattern<LoginEvent, LoginEvent> repeatedFailures = Pattern
                .<LoginEvent>begin("fail")
                .where(new SimpleCondition<LoginEvent>() {
                    @Override
                    public boolean filter(LoginEvent event) throws Exception {
                        return "fail".equalsIgnoreCase(event.getEventType());
                    }
                })
                .timesOrMore(2).consecutive()
                .until(new SimpleCondition<LoginEvent>() {
                    @Override
                    public boolean filter(LoginEvent event) throws Exception {
                        return "success".equalsIgnoreCase(event.getEventType());
                    }
                })
                .within(Time.seconds(2));

        // 2. Apply the pattern to the keyed stream.
        PatternStream<LoginEvent> matches = CEP.pattern(loginsByUser, repeatedFailures);

        // 3. Turn every match into the string form of its failed logins.
        matches.select(new PatternSelectFunction<LoginEvent, String>() {
            @Override
            public String select(Map<String, List<LoginEvent>> match) throws Exception {
                List<LoginEvent> failures = match.get("fail");
                return failures.toString();
            }
        }).print();

        try {
            env.execute();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
五、订单支付实时监控
在电商网站中,订单的支付作为直接与营销收入挂钩的一环,在业务流程中非常重要。
对于订单而言,为了正确控制业务流程,也为了增加用户的支付意愿,网站一般会设置一个支付失效时间,超过一段时间不支付的订单就会被取消。
另外,对于订单的支付,我们还应保证用户支付的正确性,这可以通过第三方支付平台的交易数据来做一个实时对账。
/**
 * Order/payment monitoring: pairs the "create" and the payment event of every order.
 *
 * <p>When both events are present, the job reports whether the payment came within
 * 15 minutes of the creation. When only one of them is seen, an event-time timer set
 * 30 minutes after that event reports which half is missing.
 */
public class P07_Error_Order {
    public static void main(String[] args) {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(3);

        // eventTime is already in milliseconds (converted while parsing)
        WatermarkStrategy<OrderEvent> wms = WatermarkStrategy
                .<OrderEvent>forBoundedOutOfOrderness(Duration.ofSeconds(20))
                .withTimestampAssigner(new SerializableTimestampAssigner<OrderEvent>() {
                    @Override
                    public long extractTimestamp(OrderEvent element, long recordTimestamp) {
                        return element.getEventTime();
                    }
                });

        env
                .readTextFile("input/OrderLog.csv")
                .map(line -> {
                    String[] datas = line.split(",");
                    return new OrderEvent(
                            Long.valueOf(datas[0]),
                            datas[1],
                            datas[2],
                            Long.parseLong(datas[3]) * 1000);
                })
                .assignTimestampsAndWatermarks(wms)
                .keyBy(OrderEvent::getOrderId)
                .process(new KeyedProcessFunction<Long, OrderEvent, String>() {
                    // The "create" event of the order, stored while the payment has not arrived.
                    private ValueState<OrderEvent> createState;
                    // The payment event of the order, stored while the "create" has not arrived.
                    private ValueState<OrderEvent> payState;

                    @Override
                    public void open(Configuration parameters) throws Exception {
                        createState = getRuntimeContext().getState(new ValueStateDescriptor<OrderEvent>("createState", OrderEvent.class));
                        payState = getRuntimeContext().getState(new ValueStateDescriptor<OrderEvent>("payState", OrderEvent.class));
                    }

                    @Override
                    public void processElement(OrderEvent value,
                                               Context ctx,
                                               Collector<String> out) throws Exception {
                        if (createState.value() == null && payState.value() == null) {
                            // first event of the order: wait up to 30 minutes for its counterpart
                            ctx.timerService().registerEventTimeTimer(value.getEventTime() + 30 * 60 * 1000);
                        } else {
                            // second event of the order: cancel the timer set by the first one
                            long timer = (createState.value() != null ? createState.value().getEventTime()
                                    : payState.value().getEventTime()) + 30 * 60 * 1000;
                            ctx.timerService().deleteEventTimeTimer(timer);
                        }

                        String eventType = value.getEventType();
                        if ("create".equalsIgnoreCase(eventType)) { // order created
                            if (payState.value() == null) { // payment not seen yet
                                createState.update(value);
                            } else {
                                // payment arrived first: check the 15-minute limit
                                if (payState.value().getEventTime() - value.getEventTime() < 15 * 60 * 1000) {
                                    out.collect("订单:" + value.getOrderId() + "正常支付");
                                } else {
                                    out.collect("订单:" + value.getOrderId() + "超时支付,存在系统bug");
                                }
                            }
                        } else { // payment
                            if (createState.value() == null) {
                                // payment arrived before the order creation
                                payState.update(value);
                            } else {
                                // check the 15-minute limit
                                if (value.getEventTime() - createState.value().getEventTime() < 15 * 60 * 1000) {
                                    out.collect("订单:" + value.getOrderId() + "正常支付");
                                } else {
                                    out.collect("订单:" + value.getOrderId() + "超时支付,存在系统bug");
                                }
                            }
                        }
                        // NOTE(review): neither state is cleared after a match or after the timer
                        // fires, so state for finished orders is retained; confirm whether a
                        // clean-up is wanted.
                    }

                    @Override
                    public void onTimer(long timestamp,
                                        OnTimerContext ctx,
                                        Collector<String> out) throws Exception {
                        // The timer only fires when the counterpart never arrived, so exactly one of
                        // the two states is expected to be set. The check was previously inverted and
                        // dereferenced createState when it was null.
                        OrderEvent created = createState.value();
                        OrderEvent paid = payState.value();
                        if (created != null) {
                            // created but never paid
                            out.collect("订单:" + created.getOrderId() + " 有下单但是没有支付");
                        } else if (paid != null) {
                            // paid but the creation was never seen
                            out.collect("订单:" + paid.getOrderId() + " 有支付信息没有下单信息,存在系统bug");
                        }
                    }
                })
                .print();

        try {
            env.execute();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
六、Maven依赖
<?xml version="1.0" encoding="UTF-8"?>
<!-- Build file for the Flink examples: Flink 1.12.0 on Scala 2.11, Java 8. -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>com</groupId>
    <artifactId>atguigu</artifactId>
    <version>1.0-SNAPSHOT</version>

    <!-- Versions shared by several dependencies are declared once here. -->
    <properties>
        <flink.version>1.12.0</flink.version>
        <java.version>1.8</java.version>
        <scala.binary.version>2.11</scala.binary.version>
        <slf4j.version>1.7.30</slf4j.version>
    </properties>

    <dependencies>
        <!-- Flink core, streaming API, client and web UI -->
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-java</artifactId>
            <version>${flink.version}</version>
            <scope>provided</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-streaming-java_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
            <scope>provided</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-clients_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
            <scope>provided</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-runtime-web_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
            <scope>provided</scope>
        </dependency>

        <!-- Logging -->
        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-api</artifactId>
            <version>${slf4j.version}</version>
            <scope>provided</scope>
        </dependency>
        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-log4j12</artifactId>
            <version>${slf4j.version}</version>
            <scope>provided</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.logging.log4j</groupId>
            <artifactId>log4j-to-slf4j</artifactId>
            <version>2.14.0</version>
            <scope>provided</scope>
        </dependency>

        <!-- https://mvnrepository.com/artifact/org.projectlombok/lombok -->
        <dependency>
            <groupId>org.projectlombok</groupId>
            <artifactId>lombok</artifactId>
            <version>1.18.16</version>
            <scope>provided</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>3.1.3</version>
            <scope>provided</scope>
        </dependency>

        <!-- Connectors and helpers -->
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-connector-kafka_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <!-- Declared once; the second, identical mysql-connector-java entry was removed. -->
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>5.1.49</version>
        </dependency>
        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>fastjson</artifactId>
            <version>1.2.75</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/org.apache.flink/flink-connector-redis -->
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-connector-redis_${scala.binary.version}</artifactId>
            <version>1.1.5</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/org.apache.flink/flink-connector-elasticsearch6 -->
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-connector-elasticsearch6_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-connector-jdbc_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
        </dependency>

        <!-- State backend and CEP library -->
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-statebackend-rocksdb_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
            <scope>provided</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-cep_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <!-- Builds an additional jar-with-dependencies during the package phase. -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-assembly-plugin</artifactId>
                <version>3.3.0</version>
                <configuration>
                    <descriptorRefs>
                        <descriptorRef>jar-with-dependencies</descriptorRef>
                    </descriptorRefs>
                </configuration>
                <executions>
                    <execution>
                        <id>make-assembly</id>
                        <phase>package</phase>
                        <goals>
                            <goal>single</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>
</project>
七、重要点
7.1 计算明日0时的毫秒值
// Timestamp of the current element in epoch SECONDS (ctx.timestamp() is divided by 1000).
final long now = ctx.timestamp() / 1000;
// The local date-time of that instant at offset UTC+8.
final LocalDateTime today = LocalDateTime.ofEpochSecond(now, 0, ZoneOffset.ofHours(8));
// 00:00:00 of the following day, still as a local date-time.
final LocalDateTime tomorrow = LocalDateTime.of(today.toLocalDate().plusDays(1), LocalTime.of(0, 0, 0));
// Back to epoch SECONDS, interpreting the local date-time at offset UTC+8.
final long time = tomorrow.toEpochSecond(ZoneOffset.ofHours(8));
// Epoch MILLISECONDS of tomorrow's midnight, e.g. for registerEventTimeTimer.
final long timeMs = time * 1000L;