Preface
Flink jobs often have to handle duplicate records. Based on hands-on experience, this post summarizes three deduplication approaches.
1. Deduplication method 1: converting the stream to a table
Core code logic:
// compute the real-time IOPV
SingleOutputStreamOperator<FundIopvIndicators> streamOperator = EtfIopvFunction.calculateRealTimeIopv(stringKeyedStream);
// convert the stream to a table
Table table = tableEnv.fromDataStream(streamOperator, "fundCode,realTimeIopv,computationTime,strTime");
// deduplicate: keep one row per strTime; ordering by computationTime desc keeps the latest row
// (ordering by strTime itself would be a no-op, since strTime is also the partition key)
Table duplicateRemoval = tableEnv.sqlQuery(" select strTime,fundCode,realTimeIopv,computationTime from ( "
        + " select strTime,fundCode,realTimeIopv,computationTime, ROW_NUMBER() OVER "
        + " (PARTITION BY strTime ORDER BY computationTime desc) AS rownum from "
        + table
        + ") where rownum=1"
);
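To keep processing downstream, the deduplicated table can be turned back into a DataStream. A minimal sketch using the legacy toRetractStream API (the same API generation as the fromDataStream call above); since ROW_NUMBER deduplication produces updates, each record carries a Boolean flag marking inserts (true) versus retractions (false):

// convert the deduplicated table back to a retract stream
DataStream<Tuple2<Boolean, Row>> resultStream = tableEnv.toRetractStream(duplicateRemoval, Row.class);
// keep only the insert records before further processing
resultStream.filter(t -> t.f0).print();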
2. Deduplication method 2: Flink SQL
-- deduplication query
-- kafka source
CREATE TABLE user_log (
user_id VARCHAR
,item_id VARCHAR
,category_id VARCHAR
,behavior INT
,ts TIMESTAMP(3)
,process_time as proctime()
, WATERMARK FOR ts AS ts
) WITH (
'connector' = 'kafka'
,'topic' = 'user_behavior'
,'properties.bootstrap.servers' = 'localhost:9092'
,'properties.group.id' = 'user_log'
,'scan.startup.mode' = 'group-offsets'
,'format' = 'json'
);
-- sink table
CREATE TABLE user_log_sink (
user_id VARCHAR
,item_id VARCHAR
,category_id VARCHAR
,behavior INT
,ts TIMESTAMP(3)
,num BIGINT
,primary key (user_id) not enforced
) WITH (
'connector' = 'upsert-kafka'
,'topic' = 'user_behavior_sink'
,'properties.bootstrap.servers' = 'localhost:9092'
,'properties.group.id' = 'user_log'
,'key.format' = 'json'
,'key.json.ignore-parse-errors' = 'true'
,'value.format' = 'json'
,'value.json.fail-on-missing-field' = 'false'
,'value.fields-include' = 'ALL'
);
-- insert
insert into user_log_sink(user_id, item_id, category_id, behavior, ts, num)
SELECT user_id, item_id, category_id, behavior, ts, rownum
FROM (
  SELECT user_id, item_id, category_id, behavior, ts,
    ROW_NUMBER() OVER (PARTITION BY category_id ORDER BY process_time desc) AS rownum -- desc keeps the latest row per key
  FROM user_log)
WHERE rownum = 1 -- only rownum = 1 deduplicates; with rownum = 2 (or rownum < 10) each partition emits just the row ranked 2 (multiple rows for <), i.e. Top-N behavior rather than deduplication
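The three statements above can be submitted from a regular Java job via executeSql. A minimal sketch; the constants SOURCE_DDL, SINK_DDL and DEDUP_INSERT are placeholders standing in for the full statement text from this section:

import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;

public class DeduplicateSqlJob {
    // placeholders: paste the full statements from this section
    static final String SOURCE_DDL = "CREATE TABLE user_log (...)";
    static final String SINK_DDL = "CREATE TABLE user_log_sink (...)";
    static final String DEDUP_INSERT = "insert into user_log_sink ...";

    public static void main(String[] args) {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        StreamTableEnvironment tableEnv = StreamTableEnvironment.create(env);
        tableEnv.executeSql(SOURCE_DDL);   // register the kafka source
        tableEnv.executeSql(SINK_DDL);     // register the upsert-kafka sink
        // for INSERT statements executeSql submits the job; no extra env.execute() is needed
        tableEnv.executeSql(DEDUP_INSERT);
    }
}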
3. Deduplication method 3: using Flink's MapState
import org.apache.flink.api.common.functions.RichFlatMapFunction;
import org.apache.flink.api.common.state.MapState;
import org.apache.flink.api.common.state.MapStateDescriptor;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.util.Collector;

public class IopvDeduplicateProcessFunction extends RichFlatMapFunction<FundIopvIndicators, FundIopvIndicators> {
    // keyed state: one map per key, so the function must run on a KeyedStream
    private MapState<String, FundIopvIndicators> mapState;

    /** Initialize the state handle */
    @Override
    public void open(Configuration parameters) throws Exception {
        MapStateDescriptor<String, FundIopvIndicators> descriptor =
                new MapStateDescriptor<>("MapDescriptor", String.class, FundIopvIndicators.class);
        mapState = getRuntimeContext().getMapState(descriptor);
    }

    @Override
    public void flatMap(FundIopvIndicators iopvIndicators, Collector<FundIopvIndicators> collector) throws Exception {
        String strTime = iopvIndicators.getStrTime();
        // emit only the first record seen for this strTime; later duplicates are dropped
        // note: without a state TTL this map grows unboundedly
        if (!mapState.contains(strTime)) {
            mapState.put(strTime, iopvIndicators);
            collector.collect(iopvIndicators);
        }
    }
}
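Since MapState is keyed state, the function has to be applied after a keyBy. A minimal wiring sketch; keying by fundCode (via an assumed getFundCode accessor, in line with the fields used in method 1):

// each fund code gets its own seen-strTime map (getFundCode is an assumed accessor)
SingleOutputStreamOperator<FundIopvIndicators> deduped = streamOperator
        .keyBy(FundIopvIndicators::getFundCode)
        .flatMap(new IopvDeduplicateProcessFunction());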