Flink SQL data deduplication

-- Flink SQL data deduplication
-- ROW_NUMBER() OVER (PARTITION BY id ORDER BY proctime ASC) AS row_num

DROP TABLE `my_hive`.`dl_test`.`a3`;

CREATE TABLE `my_hive`.`dl_test`.`a3` (
  `id` VARCHAR(2147483647) NOT NULL,
  `type` VARCHAR(2147483647) NOT NULL,
  `data` VARCHAR(2147483647) NOT NULL,
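  -- processing-time attribute; the dedup query below orders by this column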
  proctime AS PROCTIME()
) WITH (
  'properties.bootstrap.servers' = 'xxxxx:9092',
  'connector' = 'kafka',
  'json.ignore-parse-errors' = 'true',
  'format' = 'json',
  'topic' = 'flink_test',
  'properties.group.id' = 'flink_test_02',
  'scan.startup.mode' = 'earliest-offset'
)
;
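If the messages carried their own timestamp, the table could declare an event-time attribute instead of processing time, which makes the deduplication deterministic across reruns. A minimal sketch, assuming a hypothetical ts field in the JSON payload (table name a3_evt is also illustrative):

CREATE TABLE `my_hive`.`dl_test`.`a3_evt` (
  `id` VARCHAR(2147483647) NOT NULL,
  `type` VARCHAR(2147483647) NOT NULL,
  `data` VARCHAR(2147483647) NOT NULL,
  `ts` TIMESTAMP(3),  -- hypothetical event timestamp carried in the payload
  -- event-time attribute; tolerates 5 seconds of out-of-order arrival
  WATERMARK FOR `ts` AS `ts` - INTERVAL '5' SECOND
) WITH (
  'connector' = 'kafka',
  'topic' = 'flink_test',
  'properties.bootstrap.servers' = 'xxxxx:9092',
  'properties.group.id' = 'flink_test_02',
  'scan.startup.mode' = 'earliest-offset',
  'format' = 'json',
  'json.ignore-parse-errors' = 'true'
)
;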

-- Deduplicate on id and keep only the first row seen for each id,
-- dropping any later rows that arrive with the same id.


SELECT id, SUM(data) AS sum_data
FROM (
  SELECT id, type, CAST(data AS INT) AS data
  FROM (
    SELECT *,
      ROW_NUMBER() OVER (PARTITION BY id ORDER BY proctime ASC) AS row_num
    FROM `my_hive`.`dl_test`.`a3`)
  WHERE row_num = 1
) t
GROUP BY id
;
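Per the Flink docs, the same pattern can keep the most recent row per key instead of the first one: order the ROW_NUMBER() window by the time attribute descending. A sketch of the deduplication step with that change:

SELECT id, type, CAST(data AS INT) AS data
FROM (
  SELECT *,
    -- DESC keeps the last row seen per id instead of the first
    ROW_NUMBER() OVER (PARTITION BY id ORDER BY proctime DESC) AS row_num
  FROM `my_hive`.`dl_test`.`a3`)
WHERE row_num = 1
;

Note that in streaming mode this variant produces an updating result: each newer row for an id retracts the one emitted before it, so the downstream sink must support updates.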

-- Kafka test data
{"id":"1","type":"1","data":"1"}
{"id":"1","type":"2","data":"1"}
{"id":"1","type":"3","data":"1"}
{"id":"1","type":"3","data":"2"}
{"id":"2","type":"1","data":"1"}
{"id":"3","type":"1","data":"1"}

Conclusion: Flink SQL deduplication on a stream yields the same result as the equivalent batch logic.

Flink docs: Deduplication | Apache Flink
