由于一些原因造成hive数据层有重复数据需要进行动态去重的话,可以尝试如下操作来搞定:
根据单一字段来进行去重
-- 根据id字段来进行去重
SELECT id
,str
,dt
FROM (
SELECT *
,ROW_NUMBER() OVER (PARTITION BY id ORDER BY dt DESC ) AS r
FROM (
SELECT *
FROM VALUES
('o1','abcdefg','20220210')
,('o1','abcdefg','20220210')
,('o1','abc','20220210')
,('o1','abcdefg','20220211') as(id,str,dt)
) u
) b
WHERE r = 1
;
id str dt
o1 abcdefg 20220211
根据多个字段来进行去重
SELECT id
,str
,dt
FROM (
SELECT *
,ROW_NUMBER() OVER (PARTITION BY id,str ORDER BY dt DESC ) AS r
FROM (
SELECT *
FROM VALUES
('o1','abcdefg','20220210')
,('o1','abcdefg','20220210')
,('o1','abc','20220210')
,('o1','abcdefg','20220211') as(id,str,dt)
) u
) b
WHERE r = 1
;
id str dt
o1 abc 20220210
o1 abcdefg 20220211