需求对id进行去重
create table test100
(
id int,
month string,
label int
)
row format delimited
fields terminated by ','
stored as textfile;
数据
133,201901,1
134,201812,1
133,201809,1
134,201803,1
133,201801,1
134,201801,1
加载数据
load data local inpath '/root/test100.ttx' into table test100;
distinct
distinct
只能对单个字段去重,完成不了这个需求
select distinct(id),month,label from test100;
133 201801 1
133 201809 1
133 201901 1
134 201801 1
134 201803 1
134 201812 1
group by
行不通
hive> SELECT id,month,label FROM test100 GROUP BY id;
FAILED: SemanticException [Error 10025]: Line 1:10 Expression not in GROUP BY key 'month'
row_number
with a as
(select id,month,label,row_number() over(distribute by id sort by month) `rn` from test100)
select id,month,label from a
where a.rn=1
;
133 201801 1
134 201801 1
总结
distinct
去重和group by
去重有很多局限,优选row_number
函数进行去重