求最近60分钟内点击量前3的热门商品(每5分钟统计一次)
数据示例:
UserBehavior1.csv
用户ID,商品ID,商品类目ID,用户行为,发生时间
58,16,5,fav,1569866397000
834,22,0,buy,1569866397000
56,33,0,cart,1569866397000
162,43,1,pv,1569866397000
由于数据过多不展示全部
HotItems.java
package com.xxxxx.flink.demo.topGoods;
import org.apache.flink.api.common.functions.AggregateFunction;
import org.apache.flink.api.common.functions.FilterFunction;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.java.tuple.Tuple;
import org.apache.flink.streaming.api.TimeCharacteristic;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.KeyedStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.datastream.WindowedStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.timestamps.AscendingTimestampExtractor;
import org.apache.flink.streaming.api.functions.windowing.WindowFunction;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
import org.apache.flink.util.Collector;
import scala.Int;
import javax.annotation.Nullable;
import java.io.File;
import java.net.URL;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Iterator;
/**
 * Flink streaming job that ranks items by click count.
 *
 * <p>Pipeline: read {@code data/UserBehavior1.csv} line by line, parse each line into a
 * {@code UserBehavior}, assign event-time timestamps, keep only {@code "pv"} (click) records,
 * count clicks per item in a 60-minute window that slides every 5 minutes, and finally hand the
 * per-window counts to {@link TopNHotItems} which prints the top 3 items of each window.
 */
public class HotItems {

    /** Value of the behavior column that marks a click record. */
    private static final String CLICK_BEHAVIOR = "pv";

    /**
     * Builds and runs the job.
     *
     * @param args unused
     * @throws Exception if the job cannot be built or fails while executing
     */
    public static void main(String[] args) throws Exception {
        // Create the execution environment.
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        // Process records by event time rather than processing time.
        env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);
        // Parallelism 1 keeps the console output in order.
        env.setParallelism(1);

        DataStreamSource<String> textFile = env.readTextFile("data/UserBehavior1.csv");

        // Parse each CSV line: userId,itemId,categoryId,behavior,timestamp.
        SingleOutputStreamOperator<UserBehavior> behaviors =
                textFile.map(new MapFunction<String, UserBehavior>() {
                    @Override
                    public UserBehavior map(String value) throws Exception {
                        String[] fields = value.split(",");
                        // parseLong/parseInt yield primitives directly (no boxing round trip).
                        long userId = Long.parseLong(fields[0]);
                        long itemId = Long.parseLong(fields[1]);
                        int categoryId = Integer.parseInt(fields[2]);
                        String behavior = fields[3];
                        long timestamp = Long.parseLong(fields[4]);
                        return new UserBehavior(userId, itemId, categoryId, behavior, timestamp);
                    }
                });

        // Use the record's own timestamp as event time; the extractor expects ascending
        // timestamps and derives the watermarks from them.
        SingleOutputStreamOperator<UserBehavior> timestamped =
                behaviors.assignTimestampsAndWatermarks(new AscendingTimestampExtractor<UserBehavior>() {
                    @Override
                    public long extractAscendingTimestamp(UserBehavior userBehavior) {
                        return userBehavior.timestamp;
                    }
                });

        // Keep click records only.
        SingleOutputStreamOperator<UserBehavior> clicks =
                timestamped.filter(new FilterFunction<UserBehavior>() {
                    @Override
                    public boolean filter(UserBehavior userBehavior) throws Exception {
                        // Constant-first comparison: a null behavior is filtered out
                        // instead of raising a NullPointerException.
                        return CLICK_BEHAVIOR.equals(userBehavior.behavior);
                    }
                });

        // Group by item id.
        KeyedStream<UserBehavior, Tuple> keyedByItem = clicks.keyBy("itemId");

        // Every 5 minutes, evaluate the last 60 minutes of data.
        WindowedStream<UserBehavior, Tuple, TimeWindow> windowedStream =
                keyedByItem.timeWindow(Time.minutes(60), Time.minutes(5));

        // Count clicks per item incrementally: only one running counter per (item, window)
        // is kept in state instead of every buffered record.
        SingleOutputStreamOperator<ItemViewCount> itemCounts =
                windowedStream.aggregate(new ClickCounter(), new ItemViewCountEmitter());

        // Regroup by window end so all item counts of one window meet in the same key.
        KeyedStream<ItemViewCount, Tuple> keyedByWindowEnd = itemCounts.keyBy("windowEnd");

        SingleOutputStreamOperator<String> topItems = keyedByWindowEnd.process(new TopNHotItems(3));
        topItems.print();

        env.execute("Hot Items Job");
    }

    /** Incremental aggregate that counts the records added to a window. */
    private static final class ClickCounter implements AggregateFunction<UserBehavior, Long, Long> {
        @Override
        public Long createAccumulator() {
            return 0L;
        }

        @Override
        public Long add(UserBehavior value, Long accumulator) {
            return accumulator + 1;
        }

        @Override
        public Long getResult(Long accumulator) {
            return accumulator;
        }

        @Override
        public Long merge(Long a, Long b) {
            return a + b;
        }
    }

    /** Wraps the final count of one (item, window) pair into an {@code ItemViewCount}. */
    private static final class ItemViewCountEmitter
            implements WindowFunction<Long, ItemViewCount, Tuple, TimeWindow> {
        @Override
        public void apply(Tuple key, TimeWindow window, Iterable<Long> input, Collector<ItemViewCount> out)
                throws Exception {
            Long itemId = key.getField(0);
            // After pre-aggregation the iterable carries exactly one element: the final count.
            long viewCount = input.iterator().next();
            out.collect(ItemViewCount.of(itemId, window.getEnd(), viewCount));
        }
    }
}
TopNHotItems.java
package com.xxxxx.flink.demo.topGoods;
import org.apache.flink.api.common.state.ListState;
import org.apache.flink.api.common.state.ListStateDescriptor;
import org.apache.flink.api.java.tuple.Tuple;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.functions.KeyedProcessFunction;
import org.apache.flink.util.Collector;
import java.sql.Timestamp;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
/**
* 求某个窗口中前 N 名的热门点击商品,key 为窗口时间戳,输出为 TopN 的结果字符串
*/
/**
 * Emits the top N most-clicked items of a window as a formatted string.
 *
 * <p>Every incoming {@code ItemViewCount} is buffered in keyed list state and an event-time timer
 * is registered at {@code windowEnd + 1}. When the timer fires, the buffered counts are sorted by
 * click count in descending order and the first {@code topSize} entries are printed.
 * The upstream stream is presumably keyed by the window end timestamp, so one key holds all
 * item counts of one window (the keying happens outside this class).
 */
public class TopNHotItems extends KeyedProcessFunction<Tuple, ItemViewCount, String> {

    /** Maximum number of items listed per window. */
    private final int topSize;

    /**
     * @param topSize maximum number of items to list per window
     */
    public TopNHotItems(int topSize) {
        this.topSize = topSize;
    }

    // Buffers item click counts until the window's timer fires and the ranking is computed.
    private ListState<ItemViewCount> itemState;

    /** Obtains the list state used to buffer the incoming item counts. */
    @Override
    public void open(Configuration parameters) throws Exception {
        super.open(parameters);
        ListStateDescriptor<ItemViewCount> itemsStateDesc = new ListStateDescriptor<>(
                "itemState-state",
                ItemViewCount.class);
        itemState = getRuntimeContext().getListState(itemsStateDesc);
    }

    /**
     * Buffers one item count and schedules the ranking for its window.
     *
     * @param input     click count of one item in one window
     * @param context   gives access to the timer service
     * @param collector unused here; results are emitted from {@link #onTimer}
     */
    @Override
    public void processElement(
            ItemViewCount input,
            Context context,
            Collector<String> collector) throws Exception {
        System.err.println(input);
        // Cache every record in state.
        itemState.add(input);
        // Register an event-time timer at windowEnd + 1; once the watermark passes it,
        // the counts belonging to this windowEnd are treated as complete.
        context.timerService().registerEventTimeTimer(input.windowEnd + 1);
    }

    /**
     * Ranks the buffered item counts and emits the formatted top N list.
     *
     * @param timestamp firing time of the timer, i.e. {@code windowEnd + 1}
     * @param ctx       timer context (unused)
     * @param out       receives the formatted ranking
     */
    @Override
    public void onTimer(
            long timestamp, OnTimerContext ctx, Collector<String> out) throws Exception {
        // Accumulate as long: a compound assignment into an int would silently truncate.
        long totalViews = 0;
        List<ItemViewCount> allItems = new ArrayList<>();
        for (ItemViewCount item : itemState.get()) {
            // Total clicks of all items in this window.
            totalViews += item.viewCount;
            allItems.add(item);
        }
        // Release the buffered data of this window.
        itemState.clear();

        // Sort by click count, highest first. Long.compare avoids the sign flip that
        // casting a long difference to int can produce.
        allItems.sort(new Comparator<ItemViewCount>() {
            @Override
            public int compare(ItemViewCount o1, ItemViewCount o2) {
                return Long.compare(o2.viewCount, o1.viewCount);
            }
        });

        // Format the ranking as a string for printing.
        StringBuilder result = new StringBuilder();
        result.append("=============================================\n");
        // timestamp - 1 recovers the window end the timer was registered for.
        result.append("时间: ").append(new Timestamp(timestamp - 1));
        result.append(" 总点击量: ").append(totalViews).append("\n");
        int shown = Math.min(topSize, allItems.size());
        for (int i = 0; i < shown; i++) {
            ItemViewCount currentItem = allItems.get(i);
            // Line format: "No<i>: 商品ID=<id> 浏览量=<count>"; the rank label is zero-based.
            result.append("No").append(i).append(":")
                    .append(" 商品ID=").append(currentItem.itemId)
                    .append(" 浏览量=").append(currentItem.viewCount)
                    .append("\n");
        }
        result.append("=============================================\n\n");

        // Throttle the output to imitate a live, scrolling result (blocks this operator for 1s).
        Thread.sleep(1000);
        out.collect(result.toString());
    }
}