Flink结合布隆过滤器(BloomFilter)去重项目实战

测试数据:

2021-01-24 15:00:01,user1,act01,view
2021-01-24 15:00:02,user1,act01,view
2021-01-24 15:00:05,user1,act01,join
2021-01-24 15:00:02,user2,act01,view
2021-01-24 15:00:05,user2,act01,join
2021-01-24 15:00:02,user3,act02,view
2021-01-24 15:00:05,user3,act02,join
2021-01-24 16:00:02,user3,act01,view
2021-01-24 16:00:05,user3,act01,join

需求说明:

需求:按天、按小时统计各个活动、不同事件的次数和人数

不要再使用HashSet作为去重的集合了,而是使用布隆过滤器

2021-01-24,act01,view,4,3
2021-01-24,act01,join,3,3
2021-01-24,act02,view,1,1
2021-01-24,act02,join,1,1

按照小时:
2021-01-24 15,act01,view,3,2
2021-01-24 16,act01,view,1,1

State 设置 TTL

实体类:

/**
 * One activity event record, plus the two counters that are filled in when the
 * record is emitted as a statistics result.
 *
 * <p>The class is kept as a plain public-field bean with a public no-argument
 * constructor so that Flink can analyze it as a POJO type instead of falling
 * back to generic serialization.
 */
public class ActivityBean {

    /** Event time text taken from the input line (sample data uses {@code yyyy-MM-dd HH:mm:ss}). */
    public String time;

    /** User id. */
    public String uid;

    /** Activity id. */
    public String aid;

    /** Event id (sample data uses values such as {@code view} and {@code join}). */
    public String eid;

    /** De-duplicated count; {@code null} until a result has been computed. */
    public Long disCount;

    /** Raw (not de-duplicated) count; {@code null} until a result has been computed. */
    public Long count;

    /**
     * Creates an empty bean with every field {@code null}.
     *
     * <p>Flink's POJO rules require a public no-argument constructor.
     */
    public ActivityBean() {
    }

    /**
     * Creates a bean for one parsed input record; both counters start as {@code null}.
     *
     * @param time event time text
     * @param uid  user id
     * @param aid  activity id
     * @param eid  event id
     */
    public ActivityBean(String time, String uid, String aid, String eid) {
        this.time = time;
        this.uid = uid;
        this.aid = aid;
        this.eid = eid;
    }

    @Override
    public String toString() {
        return "ActivityBean{" +
                "time='" + time + '\'' +
                ", uid='" + uid + '\'' +
                ", aid='" + aid + '\'' +
                ", eid='" + eid + '\'' +
                ", disCount=" + disCount +
                ", count=" + count +
                '}';
    }

    /**
     * Static factory equivalent to {@link #ActivityBean(String, String, String, String)}.
     *
     * @param time event time text
     * @param uid  user id
     * @param aid  activity id
     * @param eid  event id
     * @return a new bean holding the given values
     */
    public static ActivityBean of(String time, String uid, String aid, String eid) {
        return new ActivityBean(time, uid, aid, eid);
    }
}

主函数:

/**
 * Flink job that reads comma-separated activity events ({@code time,uid,aid,eid}),
 * and for every (activity, event) pair emits running hourly statistics: the raw
 * number of events and the number of distinct users, where distinct users are
 * detected with a Bloom filter rather than an exact set.
 *
 * <p>Only hourly figures are produced by this class.
 */
public class AdvActivityCount {

    /** Length of the {@code yyyy-MM-dd HH} prefix of the time text that identifies one hour. */
    private static final int HOUR_PREFIX_LENGTH = 13;

    /** Expected number of distinct users per (activity, event, hour) used to size each Bloom filter. */
    private static final int EXPECTED_USERS_PER_HOUR = 100000;

    /**
     * Builds and runs the job.
     *
     * @param args {@code args[0]} is the path of the properties file passed to {@code ParameterTool}
     * @throws Exception if the properties file cannot be read or the job fails
     */
    public static void main(String[] args) throws Exception {

        ParameterTool parameterTool = ParameterTool.fromPropertiesFile(args[0]);

        DataStream<String> lines = FlinkUtils.createKafkaStream(parameterTool, SimpleStringSchema.class);

        // Parse each line into an ActivityBean.
        SingleOutputStreamOperator<ActivityBean> beanDataStream = lines.process(new ProcessFunction<String, ActivityBean>() {
            @Override
            public void processElement(String value, Context ctx, Collector<ActivityBean> out) throws Exception {
                String[] fields = value.split(",");
                // Skip malformed records instead of failing the whole job: we need four
                // fields and a time long enough to cut the hour prefix from later on.
                if (fields.length < 4 || fields[0].length() < HOUR_PREFIX_LENGTH) {
                    return;
                }
                out.collect(ActivityBean.of(fields[0], fields[1], fields[2], fields[3]));
            }
        });

        // Key by (activity, event): all users of the same activity/event must land in the
        // same keyed state, otherwise the distinct-user count can never exceed 1.
        KeyedStream<ActivityBean, Tuple2<String, String>> keyedStream = beanDataStream.keyBy(new KeySelector<ActivityBean, Tuple2<String, String>>() {
            @Override
            public Tuple2<String, String> getKey(ActivityBean value) throws Exception {
                return Tuple2.of(value.aid, value.eid);
            }
        });

        SingleOutputStreamOperator<ActivityBean> result = keyedStream.process(new MyActivityCountFunction());

        result.print();

        FlinkUtils.env.execute();

    }


    /**
     * Keeps, per (activity, event) key, one entry per hour in three map states
     * (raw count, distinct count, Bloom filter of seen user ids) and emits the
     * updated figures for the hour of every incoming record.
     */
    private static class MyActivityCountFunction extends KeyedProcessFunction<Tuple2<String, String>, ActivityBean, ActivityBean> {

        // hour -> raw (not de-duplicated) number of events
        private transient MapState<String, Long> hourUserState;

        // hour -> de-duplicated number of users
        private transient MapState<String, Long> hourDisUserState;

        // hour -> Bloom filter of the user ids already counted in that hour
        private transient MapState<String, BloomFilter<String>> hourBloomFilterState;


        /**
         * Registers the three map states, all sharing one TTL configuration so that
         * entries for old hours are dropped and the counters and the Bloom filter
         * of an hour expire together.
         */
        @Override
        public void open(Configuration parameters) throws Exception {
            // An entry lives 90 minutes after its last write and is never returned once expired.
            StateTtlConfig hourStateTtlConfig = StateTtlConfig.newBuilder(Time.minutes(90))
                    .setUpdateType(StateTtlConfig.UpdateType.OnCreateAndWrite)
                    .setStateVisibility(StateTtlConfig.StateVisibility.NeverReturnExpired)
                    .build();

            MapStateDescriptor<String, Long> hourStateDescriptor = new MapStateDescriptor<>("hour-user-count-state", String.class, Long.class);
            // e.g. (2021-01-24 16 -> 2000), (2021-01-24 17 -> 2000)
            hourStateDescriptor.enableTimeToLive(hourStateTtlConfig);
            hourUserState = getRuntimeContext().getMapState(hourStateDescriptor);

            MapStateDescriptor<String, Long> hourDisStateDescriptor = new MapStateDescriptor<>("hour-dis-user-count-state", String.class, Long.class);
            // e.g. (2021-01-24 16 -> 1000), (2021-01-24 17 -> 1000)
            hourDisStateDescriptor.enableTimeToLive(hourStateTtlConfig);
            hourDisUserState = getRuntimeContext().getMapState(hourDisStateDescriptor);

            MapStateDescriptor<String, BloomFilter<String>> hourBloomDescriptor = new MapStateDescriptor<>(
                    "hour-bloom-state",
                    TypeInformation.of(String.class),
                    TypeInformation.of(new TypeHint<BloomFilter<String>>() {
                    })
            );
            hourBloomDescriptor.enableTimeToLive(hourStateTtlConfig);
            hourBloomFilterState = getRuntimeContext().getMapState(hourBloomDescriptor);

        }

        /**
         * Updates the raw and distinct counters of the record's hour and emits the
         * record with {@code time} replaced by the hour and both counters filled in.
         *
         * @param value incoming event; it is mutated and re-emitted
         * @param ctx   process-function context (unused)
         * @param out   collector receiving the updated record
         */
        @Override
        public void processElement(ActivityBean value, Context ctx, Collector<ActivityBean> out) throws Exception {

            String uid = value.uid;
            String dayAndHour = value.time.substring(0, HOUR_PREFIX_LENGTH);

            // Raw count: every event increments it.
            Long hourCount = hourUserState.get(dayAndHour);
            if (hourCount == null) {
                hourCount = 0L;
            }
            hourCount += 1;
            hourUserState.put(dayAndHour, hourCount);

            // Distinct count: the counter mirrors the number of ids put into the filter,
            // so if either one is missing (first event of the hour, or one of the two
            // expired first) both are restarted together to stay consistent.
            BloomFilter<String> hourBloomFilter = hourBloomFilterState.get(dayAndHour);
            Long hourDisCount = hourDisUserState.get(dayAndHour);
            if (hourBloomFilter == null || hourDisCount == null) {
                hourBloomFilter = BloomFilter.create(Funnels.unencodedCharsFunnel(), EXPECTED_USERS_PER_HOUR);
                hourDisCount = 0L;
            }
            // Count the user only if the filter has definitely not seen the id in this hour.
            if (!hourBloomFilter.mightContain(uid)) {
                hourDisCount += 1L;
                hourBloomFilter.put(uid);
            }
            // Write both back on every event so their TTL timestamps stay aligned.
            hourDisUserState.put(dayAndHour, hourDisCount);
            hourBloomFilterState.put(dayAndHour, hourBloomFilter);


            value.time = dayAndHour;
            value.count = hourCount;
            value.disCount = hourDisCount;
            out.collect(value);

        }

    }
}

  • 2
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

cts618

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值