测试数据:
2021-01-24 15:00:01,user1,act01,view
2021-01-24 15:00:02,user1,act01,view
2021-01-24 15:00:05,user1,act01,join
2021-01-24 15:00:02,user2,act01,view
2021-01-24 15:00:05,user2,act01,join
2021-01-24 15:00:02,user3,act02,view
2021-01-24 15:00:05,user3,act02,join
2021-01-24 16:00:02,user3,act01,view
2021-01-24 16:00:05,user3,act01,join
需求说明:
需求:按天、按小时统计各个活动、不同事件的次数和人数
不要再使用HashSet作为去重的集合了,而是使用布隆过滤器
按照天:
2021-01-24,act01,view,4,3
2021-01-24,act01,join,3,3
2021-01-24,act02,view,1,1
2021-01-24,act02,join,1,1
按照小时:
2021-01-24 15,act01,view,3,2
2021-01-24 16,act01,view,1,1
State 设置 TTL
实体类:
/**
 * A single user-activity event, also reused as the output record that carries
 * the aggregated counts.
 *
 * <p>The class keeps public fields and provides a public no-argument constructor
 * so that it satisfies Flink's POJO rules and does not have to fall back to
 * generic (Kryo) serialization when it travels between operators.
 */
public class ActivityBean {
    /** Event time as text, e.g. {@code 2021-01-24 15:00:01}. */
    public String time;
    /** User id. */
    public String uid;
    /** Activity id. */
    public String aid;
    /** Event id / event type, e.g. {@code view} or {@code join}. */
    public String eid;
    /** De-duplicated count (number of distinct users); {@code null} until computed. */
    public Long disCount;
    /** Raw, non-de-duplicated count (number of events); {@code null} until computed. */
    public Long count;

    /** No-argument constructor required for Flink POJO type extraction. */
    public ActivityBean() {
    }

    /**
     * Creates an event with the given attributes; both counters stay {@code null}.
     *
     * @param time event time text
     * @param uid  user id
     * @param aid  activity id
     * @param eid  event id / event type
     */
    public ActivityBean(String time, String uid, String aid, String eid) {
        this.time = time;
        this.uid = uid;
        this.aid = aid;
        this.eid = eid;
    }

    @Override
    public String toString() {
        return "ActivityBean{" +
                "time='" + time + '\'' +
                ", uid='" + uid + '\'' +
                ", aid='" + aid + '\'' +
                ", eid='" + eid + '\'' +
                ", disCount=" + disCount +
                ", count=" + count +
                '}';
    }

    /**
     * Static factory equivalent to {@link #ActivityBean(String, String, String, String)}.
     *
     * @param time event time text
     * @param uid  user id
     * @param aid  activity id
     * @param eid  event id / event type
     * @return a new bean whose counters are {@code null}
     */
    public static ActivityBean of(String time, String uid, String aid, String eid) {
        return new ActivityBean(time, uid, aid, eid);
    }
}
主函数:
/**
 * Flink job that reads activity events from Kafka (one CSV line per event:
 * {@code time,uid,aid,eid}) and, for every activity and event type, emits the
 * running per-hour event count and the per-hour distinct-user count.
 *
 * <p>Distinct users are tracked with a Bloom filter instead of a set, so the
 * distinct count is approximate: a false positive makes it slightly too low.
 */
public class AdvActivityCount {

    /** Number of comma-separated fields expected in each input line. */
    private static final int FIELD_COUNT = 4;

    /** Length of the {@code yyyy-MM-dd HH} prefix that identifies an hourly bucket. */
    private static final int HOUR_PREFIX_LENGTH = 13;

    /** Expected number of insertions used to size each hourly Bloom filter. */
    private static final int BLOOM_EXPECTED_INSERTIONS = 100000;

    /** Minutes after the last write before an hourly bucket's state expires. */
    private static final long STATE_TTL_MINUTES = 90;

    /**
     * Builds and runs the job.
     *
     * @param args {@code args[0]} is the path of the properties file passed to {@code ParameterTool}
     * @throws IllegalArgumentException if no properties file path is supplied
     * @throws Exception if the properties file cannot be read or the job fails
     */
    public static void main(String[] args) throws Exception {
        if (args.length < 1) {
            throw new IllegalArgumentException("Usage: AdvActivityCount <properties-file>");
        }
        ParameterTool parameterTool = ParameterTool.fromPropertiesFile(args[0]);
        DataStream<String> lines = FlinkUtils.createKafkaStream(parameterTool, SimpleStringSchema.class);

        // Parse each CSV line into a bean.
        SingleOutputStreamOperator<ActivityBean> beanDataStream = lines.process(new ProcessFunction<String, ActivityBean>() {
            @Override
            public void processElement(String value, Context ctx, Collector<ActivityBean> out) throws Exception {
                String[] fields = value.split(",");
                // Skip malformed lines instead of failing the whole job: a line needs all four
                // fields, and a time long enough to contain the "yyyy-MM-dd HH" prefix that the
                // downstream function cuts out with substring().
                if (fields.length < FIELD_COUNT || fields[0].length() < HOUR_PREFIX_LENGTH) {
                    return;
                }
                String time = fields[0];
                String uid = fields[1];
                String aid = fields[2];
                String eid = fields[3];
                out.collect(ActivityBean.of(time, uid, aid, eid));
            }
        });

        // Key by (activity id, event id): counts and distinct users are computed per activity
        // and event type. Keying by the user id would put every user in its own partition, so
        // the distinct-user count could never exceed 1.
        KeyedStream<ActivityBean, Tuple2<String, String>> keyedStream = beanDataStream.keyBy(new KeySelector<ActivityBean, Tuple2<String, String>>() {
            @Override
            public Tuple2<String, String> getKey(ActivityBean value) throws Exception {
                return Tuple2.of(value.aid, value.eid);
            }
        });

        SingleOutputStreamOperator<ActivityBean> result = keyedStream.process(new MyActivityCountFunction());
        result.print();
        FlinkUtils.env.execute();
    }

    /**
     * Keeps, per key (activity id, event id) and per hour, the raw event count, the distinct-user
     * count and the Bloom filter used for de-duplication. For every input event it emits that
     * event with {@code time} truncated to the hour and both counters filled in.
     */
    private static class MyActivityCountFunction extends KeyedProcessFunction<Tuple2<String, String>, ActivityBean, ActivityBean> {
        // Raw (non-de-duplicated) event count per hour, e.g. (2021-01-24 16 -> 2000).
        private transient MapState<String, Long> hourUserState;
        // De-duplicated (distinct user) count per hour, e.g. (2021-01-24 16 -> 1000).
        private transient MapState<String, Long> hourDisUserState;
        // Bloom filter per hour, used to decide whether a user was already counted in that hour.
        private transient MapState<String, BloomFilter<String>> hourBloomFilterState;

        /**
         * Registers the three map states. All of them share one TTL configuration so that an
         * hourly bucket's counters and filter are cleaned up together and state does not grow
         * without bound as new hours arrive.
         */
        @Override
        public void open(Configuration parameters) throws Exception {
            StateTtlConfig hourStateTtlConfig = StateTtlConfig.newBuilder(Time.minutes(STATE_TTL_MINUTES))
                    .setUpdateType(StateTtlConfig.UpdateType.OnCreateAndWrite)
                    .setStateVisibility(StateTtlConfig.StateVisibility.NeverReturnExpired)
                    .build();

            MapStateDescriptor<String, Long> hourStateDescriptor =
                    new MapStateDescriptor<>("hour-user-count-state", String.class, Long.class);
            hourStateDescriptor.enableTimeToLive(hourStateTtlConfig);
            hourUserState = getRuntimeContext().getMapState(hourStateDescriptor);

            MapStateDescriptor<String, Long> hourDisStateDescriptor =
                    new MapStateDescriptor<>("hour-dis-user-count-state", String.class, Long.class);
            hourDisStateDescriptor.enableTimeToLive(hourStateTtlConfig);
            hourDisUserState = getRuntimeContext().getMapState(hourDisStateDescriptor);

            MapStateDescriptor<String, BloomFilter<String>> hourBloomDescriptor = new MapStateDescriptor<>(
                    "hour-bloom-state",
                    TypeInformation.of(String.class),
                    TypeInformation.of(new TypeHint<BloomFilter<String>>() {
                    })
            );
            hourBloomDescriptor.enableTimeToLive(hourStateTtlConfig);
            hourBloomFilterState = getRuntimeContext().getMapState(hourBloomDescriptor);
        }

        /**
         * Updates the hourly counters for the event's bucket and emits the event enriched with
         * the current totals.
         *
         * @param value the incoming event; mutated in place and re-emitted
         * @param ctx   process-function context (unused)
         * @param out   collector receiving the enriched event
         */
        @Override
        public void processElement(ActivityBean value, Context ctx, Collector<ActivityBean> out) throws Exception {
            String uid = value.uid;
            String dayAndHour = value.time.substring(0, HOUR_PREFIX_LENGTH);

            Long hourCount = hourUserState.get(dayAndHour);
            Long hourDisCount = hourDisUserState.get(dayAndHour);
            BloomFilter<String> hourBloomFilter = hourBloomFilterState.get(dayAndHour);

            // The three entries are written together, but TTL expiry is evaluated per entry.
            // If any one of them is missing, restart the whole bucket so the counters and the
            // filter never disagree (and a missing counter can never cause an NPE on unboxing).
            if (hourCount == null || hourDisCount == null || hourBloomFilter == null) {
                hourCount = 0L;
                hourDisCount = 0L;
                hourBloomFilter = BloomFilter.create(Funnels.unencodedCharsFunnel(), BLOOM_EXPECTED_INSERTIONS);
            }

            // Raw count: every event counts.
            hourCount += 1;

            // Distinct count: only count a user the filter has definitely not seen in this hour.
            if (!hourBloomFilter.mightContain(uid)) {
                hourDisCount += 1L;
                hourBloomFilter.put(uid);
            }

            hourUserState.put(dayAndHour, hourCount);
            hourDisUserState.put(dayAndHour, hourDisCount);
            hourBloomFilterState.put(dayAndHour, hourBloomFilter);

            value.time = dayAndHour;
            value.count = hourCount;
            value.disCount = hourDisCount;
            out.collect(value);
        }
    }
}