前言:
结合上个水位线知识点做出的题目案例给予以下代码作为参考。
例题:
1.创建Flink流处理环境。
//创建流环境
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.execute();
2.从“access.txt”文件中获取数据源。
资源中查询数据源
3.从中过滤出包含有page字段的数据。
//过滤数据,转换数据
SingleOutputStreamOperator<UserEvent> stream1 = stream.filter(new FilterFunction<String>() {
@Override
public boolean filter(String value) throws Exception {
//value是一个json字符串
JSONObject jsonObject = JSONObject.parseObject(value);
JSONObject page = jsonObject.getJSONObject("page");
JSONObject common = jsonObject.getJSONObject("common");
//只保留page和common不为空数据
return page != null && common != null;
}
}).map(new MapFunction<String, UserEvent>() {
@Override
public UserEvent map(String value) throws Exception {
//value是一个JSON字符串
JSONObject jsonObject = JSONObject.parseObject(value);
//先取uid
String uid = jsonObject.getJSONObject("common").getString("uid");
//再取page_id
String pageId = jsonObject.getJSONObject("page").getString("page_id");
//取ts
Long ts = jsonObject.getLong("ts");
return new UserEvent(uid, pageId, ts);
}
});
4.设置时间戳及单调递增水位线。
SingleOutputStreamOperator<UserEvent> stream2 = stream1.assignTimestampsAndWatermarks(
WatermarkStrategy.<UserEvent>forMonotonousTimestamps()
.withTimestampAssigner((event, ts) -> event.getTs())
);
5.统计每十秒钟的用户访客量pv,以及窗口开始时间和结束时间,并输出至控制台。
SingleOutputStreamOperator<Tuple3<String, String, Long>> stream1001 = stream2.windowAll(TumblingEventTimeWindows.of(Time.seconds(10)))
.process(new ProcessAllWindowFunction<UserEvent, Tuple3<String, String, Long>, TimeWindow>() {
/**
*
* @param context 上下文对象 -- 拿到窗口信息
* @param iterable 10秒钟累积的数据
* @param collector 采集器
* @throws Exception
*/
@Override
public void process(Context context, Iterable<UserEvent> iterable, Collector<Tuple3<String, String, Long>> collector) throws Exception {
TimeWindow window = context.window();//窗口对象
long start = window.getStart();//窗口开始时间戳
long end = window.getEnd();//窗口结束时间戳
long pv = 0L;
for (UserEvent event : iterable) {
pv++;//统计页面浏览量
}
//格式话成字符串输出
String strStart = DateFormatUtils.format(start, "yyyy-MM-dd HH:mm:ss");
String strEnd = DateFormatUtils.format(end, "yyyy-MM-dd HH:mm:ss");
//采集器输出
collector.collect(Tuple3.of(strStart, strEnd, pv));
}
});
stream1001.print();
6.统计每十秒钟的独立用户访客量uv,以及窗口开始时间和结束时间,并输出至控制台。
SingleOutputStreamOperator<Tuple3<String, String, Long>> stream1002 = stream2.keyBy(value -> value.getUid()).window(TumblingEventTimeWindows.of(Time.seconds(10)))
.process(new ProcessWindowFunction<UserEvent, Tuple3<String, String, Long>, String, TimeWindow>() {
/**
*
* @param s 键名(组名)
* @param context
* @param iterable
* @param collector
* @throws Exception
*/
@Override
public void process(String s, Context context, Iterable<UserEvent> iterable, Collector<Tuple3<String, String, Long>> collector) throws Exception {
TimeWindow window = context.window();//窗口对象
long start = window.getStart();//窗口开始时间戳
long end = window.getEnd();//窗口结束时间戳
long uv = 1;
//格式话成字符串输出
String strStart = DateFormatUtils.format(start, "yyyy-MM-dd HH:mm:ss");
String strEnd = DateFormatUtils.format(end, "yyyy-MM-dd HH:mm:ss");
//采集器输出三元组
collector.collect(Tuple3.of(strStart, strEnd, 1L));
}
}).keyBy(value -> value.f0).sum(2);
stream1002.print();
7.根据窗口开始和结束时间,使用join或者union将pv,uv数据进行汇总。
stream1001.join(stream1002)
.where(value1 -> value1.f0)
.equalTo(value2 -> value2.f0)
.window(TumblingEventTimeWindows.of(Time.seconds(10)))
.apply(new JoinFunction<Tuple3<String, String, Long>, Tuple3<String, String, Long>, String>() {
@Override
public String join(Tuple3<String, String, Long> first, Tuple3<String, String, Long> second) throws Exception {
return "开始时间:"+first.f0+"结束时间:"+first.f1+",PV="+first.f2+",UV="+second.f2;
}
}).print();
8.统计每十秒钟较前十秒PV的变化,如果增加则输出正数,减少则输出负数
//另外编写一个方法 调用即可
public static class pvSumFunction extends ProcessAllWindowFunction<UserEvent, String, TimeWindow>{
private transient ValueState<Long> lastPvValueState;
@Override
public void open(Configuration parameters) throws Exception {
super.open(parameters);
//定义描述器对象
ValueStateDescriptor<Long> descriptor = new ValueStateDescriptor<Long>(
"lastPv", // the state name
// TypeInformation.of(new TypeHint<Long>() {}), // 描述类型
Types.LONG
);
//在这里初始化状态对象
lastPvValueState = getRuntimeContext().getState(descriptor);
}
//处理函数
@Override
public void process(Context context, Iterable<UserEvent> iterable, Collector<String> collector) throws Exception {
//统计当前窗口的pv
Long pv = 0L;
for (UserEvent event : iterable) {
pv++;
}
//上一个窗口的pv
Long lastPv = 0L;
Long value = lastPvValueState.value();
if (value!=null){
lastPv = value;
}
//差值
long chaZhi = pv - lastPv;
//把当前窗口的pv写回去
lastPvValueState.update(pv);
//输出结果(开始时间,结束时间)
String start = DateFormatUtils.format(context.window().getStart(), "yyyy-MM-dd HH:mm:ss");
String ebd = DateFormatUtils.format(context.window().getEnd(), "yyyy-MM-dd HH:mm:ss");
collector.collect("["+start+"-->"+ebd+")的pv是:"+pv+"-->与上一个10秒的差值是"+chaZhi);
}
}
9.统计每个十秒pv的前三值
public static class pvTopNFunction extends ProcessAllWindowFunction<UserEvent, String, TimeWindow>{
//所有窗口的pv值都存里面
private transient MapState<Long,Long> pvMapState;
@Override
public void open(Configuration parameters) throws Exception {
super.open(parameters);
//初始化
pvMapState = getRuntimeContext().getMapState(new MapStateDescriptor<Long, Long>(
"pvMapState",
Types.LONG,
Types.LONG
));
}
@Override
public void process(Context context, Iterable<UserEvent> iterable, Collector<String> collector) throws Exception {
//map是通过键来存取的
//拿到窗口开始的时间戳
long start = context.window().getStart();
//计算本窗口的pv
Long pv=0L;
for (UserEvent event : iterable) {
pv++;
}
//获取pvMapState里面的值
pvMapState.put(start,pv);
//排序并且获取Top3
Iterable<Map.Entry<Long, Long>> entries = pvMapState.entries();//取出map的键值对
//定义一个Map
Map<Long,Long>pvMap=new HashMap<>();
for (Map.Entry<Long, Long> entry : entries) {
pvMap.put(entry.getKey(),entry.getValue());
}
//排序(用的是Java里面的stream Api)
List<Map.Entry<Long, Long>> top3 = pvMap.entrySet().stream()
.sorted(Map.Entry.comparingByValue(Comparator.reverseOrder()))//comparingByValue按值排序,reverseOrder降序排序
.limit(3)//取前3条
.collect(Collectors.toList());//采集
//转换为字符串输出
String rs="前三名是:\n";
for (Map.Entry<Long, Long> longLongEntry : top3) {
Long startMilli=longLongEntry.getKey();
String strStart = DateFormatUtils.format(startMilli, "yyyy-MM-dd HH:mm:ss");
rs+="开始时间是:"+strStart+",pv是:"+longLongEntry.getValue()+"\n";
}
collector.collect(rs);
}
}
代码总结:
public class UvPvDemo1 {
public static void main(String[] args) throws Exception {
//创建流环境
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
//设置平行度
env.setParallelism(5);
//把处理方式设置为批处理模式
// env.setRuntimeMode(RuntimeExecutionMode.BATCH);
//读取文件
DataStreamSource<String> stream = env.readTextFile("D:\\workidea\\Flink1001\\input\\access.txt");
//过滤数据,转换数据
SingleOutputStreamOperator<UserEvent> stream1 = stream.filter(new FilterFunction<String>() {
@Override
public boolean filter(String value) throws Exception {
//value是一个json字符串
JSONObject jsonObject = JSONObject.parseObject(value);
JSONObject page = jsonObject.getJSONObject("page");
JSONObject common = jsonObject.getJSONObject("common");
//只保留page和common不为空数据
return page != null && common != null;
}
}).map(new MapFunction<String, UserEvent>() {
@Override
public UserEvent map(String value) throws Exception {
//value是一个JSON字符串
JSONObject jsonObject = JSONObject.parseObject(value);
//先取uid
String uid = jsonObject.getJSONObject("common").getString("uid");
//再取page_id
String pageId = jsonObject.getJSONObject("page").getString("page_id");
//取ts
Long ts = jsonObject.getLong("ts");
return new UserEvent(uid, pageId, ts);
}
});
//设置水位线策略
SingleOutputStreamOperator<UserEvent> stream2 = stream1.assignTimestampsAndWatermarks(
WatermarkStrategy.<UserEvent>forMonotonousTimestamps()
.withTimestampAssigner((event, ts) -> event.getTs())
);
//开窗(统计每十秒钟的用户访客量pv,以及窗口开始时间和结束时间,并输出至控制台)
SingleOutputStreamOperator<Tuple3<String, String, Long>> stream1001 = stream2.windowAll(TumblingEventTimeWindows.of(Time.seconds(10)))
.process(new ProcessAllWindowFunction<UserEvent, Tuple3<String, String, Long>, TimeWindow>() {
/**
*
* @param context 上下文对象 -- 拿到窗口信息
* @param iterable 10秒钟累积的数据
* @param collector 采集器
* @throws Exception
*/
@Override
public void process(Context context, Iterable<UserEvent> iterable, Collector<Tuple3<String, String, Long>> collector) throws Exception {
TimeWindow window = context.window();//窗口对象
long start = window.getStart();//窗口开始时间戳
long end = window.getEnd();//窗口结束时间戳
long pv = 0L;
for (UserEvent event : iterable) {
pv++;//统计页面浏览量
}
//格式话成字符串输出
String strStart = DateFormatUtils.format(start, "yyyy-MM-dd HH:mm:ss");
String strEnd = DateFormatUtils.format(end, "yyyy-MM-dd HH:mm:ss");
//采集器输出
collector.collect(Tuple3.of(strStart, strEnd, pv));
}
});
// stream1001.print();
//(开始时间,结束数据,1)-》分组聚合(开始时间,结束时间,uv总和)
//(统计每十秒钟的独立用户访客量uv,以及窗口开始时间和结束时间,并输出至控制台)
SingleOutputStreamOperator<Tuple3<String, String, Long>> stream1002 = stream2.keyBy(value -> value.getUid()).window(TumblingEventTimeWindows.of(Time.seconds(10)))
.process(new ProcessWindowFunction<UserEvent, Tuple3<String, String, Long>, String, TimeWindow>() {
/**
*
* @param s 键名(组名)
* @param context
* @param iterable
* @param collector
* @throws Exception
*/
@Override
public void process(String s, Context context, Iterable<UserEvent> iterable, Collector<Tuple3<String, String, Long>> collector) throws Exception {
TimeWindow window = context.window();//窗口对象
long start = window.getStart();//窗口开始时间戳
long end = window.getEnd();//窗口结束时间戳
long uv = 1;
//格式话成字符串输出
String strStart = DateFormatUtils.format(start, "yyyy-MM-dd HH:mm:ss");
String strEnd = DateFormatUtils.format(end, "yyyy-MM-dd HH:mm:ss");
//采集器输出三元组
collector.collect(Tuple3.of(strStart, strEnd, 1L));
}
}).keyBy(value -> value.f0).sum(2);
// stream1002.print();
//
//(根据窗口开始和结束时间,使用join或者union将pv,uv数据进行汇总)
stream1001.join(stream1002)
.where(value1 -> value1.f0)
.equalTo(value2 -> value2.f0)
.window(TumblingEventTimeWindows.of(Time.seconds(10)))
.apply(new JoinFunction<Tuple3<String, String, Long>, Tuple3<String, String, Long>, String>() {
@Override
public String join(Tuple3<String, String, Long> first, Tuple3<String, String, Long> second) throws Exception {
return "开始时间:"+first.f0+"结束时间:"+first.f1+",PV="+first.f2+",UV="+second.f2;
}
}).print();
env.execute();
}
}
public class UvPvDemo2 {
public static void main(String[] args) throws Exception {
//创建流环境
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
//设置平行度
env.setParallelism(1);
//把处理方式设置为批处理模式
env.setRuntimeMode(RuntimeExecutionMode.BATCH);
//读取文件
DataStreamSource<String> stream = env.readTextFile("D:\\workidea\\Flink1001\\input\\access.txt");
//过滤数据,转换数据
SingleOutputStreamOperator<UserEvent> stream1 = stream.filter(new FilterFunction<String>() {
@Override
public boolean filter(String value) throws Exception {
//value是一个json字符串
JSONObject jsonObject = JSONObject.parseObject(value);
JSONObject page = jsonObject.getJSONObject("page");
JSONObject common = jsonObject.getJSONObject("common");
//只保留page和common不为空数据
return page != null && common != null;
}
}).map(new MapFunction<String, UserEvent>() {
@Override
public UserEvent map(String value) throws Exception {
//value是一个JSON字符串
JSONObject jsonObject = JSONObject.parseObject(value);
//先取uid
String uid = jsonObject.getJSONObject("common").getString("uid");
//再取page_id
String pageId = jsonObject.getJSONObject("page").getString("page_id");
//取ts
Long ts = jsonObject.getLong("ts");
return new UserEvent(uid, pageId, ts);
}
});
//设置水位线策略
SingleOutputStreamOperator<UserEvent> stream2 = stream1.assignTimestampsAndWatermarks(
WatermarkStrategy.<UserEvent>forMonotonousTimestamps()
.withTimestampAssigner((event, ts) -> event.getTs())
);
//统计每十秒钟较前十秒PV的变化,如果增加则输出正数,减少则输出负数
stream2
//滚动窗口
.windowAll(TumblingEventTimeWindows.of(Time.seconds(10)))
//滑动窗口
// .windowAll(SlidingEventTimeWindows.of(Time.seconds(10),Time.seconds(5)))
//会话窗口,需要指定会话中断间隔时间
// .windowAll(EventTimeSessionWindows.withGap(Time.seconds(3)))
.process(new pvTopNFunction()).print();
// stream2.print();
env.execute();
}
public static class pvSumFunction extends ProcessAllWindowFunction<UserEvent, String, TimeWindow>{
private transient ValueState<Long> lastPvValueState;
@Override
public void open(Configuration parameters) throws Exception {
super.open(parameters);
//定义描述器对象
ValueStateDescriptor<Long> descriptor = new ValueStateDescriptor<Long>(
"lastPv", // the state name
// TypeInformation.of(new TypeHint<Long>() {}), // 描述类型
Types.LONG
);
//在这里初始化状态对象
lastPvValueState = getRuntimeContext().getState(descriptor);
}
//处理函数
@Override
public void process(Context context, Iterable<UserEvent> iterable, Collector<String> collector) throws Exception {
//统计当前窗口的pv
Long pv = 0L;
for (UserEvent event : iterable) {
pv++;
}
//上一个窗口的pv
Long lastPv = 0L;
Long value = lastPvValueState.value();
if (value!=null){
lastPv = value;
}
//差值
long chaZhi = pv - lastPv;
//把当前窗口的pv写回去
lastPvValueState.update(pv);
//输出结果(开始时间,结束时间)
String start = DateFormatUtils.format(context.window().getStart(), "yyyy-MM-dd HH:mm:ss");
String ebd = DateFormatUtils.format(context.window().getEnd(), "yyyy-MM-dd HH:mm:ss");
collector.collect("["+start+"-->"+ebd+")的pv是:"+pv+"-->与上一个10秒的差值是"+chaZhi);
}
}
public static class pvTopNFunction extends ProcessAllWindowFunction<UserEvent, String, TimeWindow>{
//所有窗口的pv值都存里面
private transient MapState<Long,Long> pvMapState;
@Override
public void open(Configuration parameters) throws Exception {
super.open(parameters);
//初始化
pvMapState = getRuntimeContext().getMapState(new MapStateDescriptor<Long, Long>(
"pvMapState",
Types.LONG,
Types.LONG
));
}
@Override
public void process(Context context, Iterable<UserEvent> iterable, Collector<String> collector) throws Exception {
//map是通过键来存取的
//拿到窗口开始的时间戳
long start = context.window().getStart();
//计算本窗口的pv
Long pv=0L;
for (UserEvent event : iterable) {
pv++;
}
//获取pvMapState里面的值
pvMapState.put(start,pv);
//排序并且获取Top3
Iterable<Map.Entry<Long, Long>> entries = pvMapState.entries();//取出map的键值对
//定义一个Map
Map<Long,Long>pvMap=new HashMap<>();
for (Map.Entry<Long, Long> entry : entries) {
pvMap.put(entry.getKey(),entry.getValue());
}
//排序(用的是Java里面的stream Api)
List<Map.Entry<Long, Long>> top3 = pvMap.entrySet().stream()
.sorted(Map.Entry.comparingByValue(Comparator.reverseOrder()))//comparingByValue按值排序,reverseOrder降序排序
.limit(3)//取前3条
.collect(Collectors.toList());//采集
//转换为字符串输出
String rs="前三名是:\n";
for (Map.Entry<Long, Long> longLongEntry : top3) {
Long startMilli=longLongEntry.getKey();
String strStart = DateFormatUtils.format(startMilli, "yyyy-MM-dd HH:mm:ss");
rs+="开始时间是:"+strStart+",pv是:"+longLongEntry.getValue()+"\n";
}
collector.collect(rs);
}
}
}