-- Fixture table for the collect_set de-duplication examples below.
-- NOTE: the original DDL carried "ROW FORMAT DELIMITED ... NULL DEFINED AS ''",
-- but those are text-serde properties that ORC silently ignores (ORC uses its
-- own serde), so they are dropped here. IF NOT EXISTS makes the script rerunnable.
CREATE TABLE IF NOT EXISTS xxxx.test_collect_list_set (
    first_level_directory  INT,
    second_level_directory INT,
    third_level_directory  INT,
    order_id               INT
)
STORED AS ORC
TBLPROPERTIES ('orc.compress' = 'SNAPPY');
-- Load the sample rows. The original statement used "insert overwrite into",
-- which is invalid Hive syntax: OVERWRITE and INTO are mutually exclusive,
-- and Hive's row-constructor form (INSERT ... VALUES) only supports INSERT INTO.
INSERT INTO TABLE xxxx.test_collect_list_set
VALUES
    (1, 2, 3, 1),
    (1, 2, 3, 1),
    (1, 2, 3, 4),
    (1, 2, 4, 1),
    (1, 2, 4, 5)
;
-- Roll the per-third-level order lists up to (first, second) level.
-- FIX: concat_ws only accepts STRING / array<string> arguments, so order_id
-- (an INT) must be cast to STRING inside collect_set. Passing the raw
-- array<int> is what raised:
--   "java.util.ArrayList cannot be cast to org.apache.hadoop.io.IntWritable"
-- Note: collect_set here only de-duplicates whole comma-separated strings,
-- not the individual ids inside them.
SELECT
    first_level_directory,
    second_level_directory,
    collect_set(set_list) AS set_list
FROM (
    SELECT
        first_level_directory,
        second_level_directory,
        third_level_directory,
        concat_ws(',', collect_set(CAST(order_id AS STRING))) AS set_list
    FROM xxxx.test_collect_list_set
    GROUP BY first_level_directory, second_level_directory, third_level_directory
) a
GROUP BY first_level_directory, second_level_directory;
Error:
Caused by: java.lang.ClassCastException: java.util.ArrayList cannot be cast to org.apache.hadoop.io.IntWritable
-- Clean up the fixture. IF EXISTS keeps the teardown idempotent.
DROP TABLE IF EXISTS xxxx.test_collect_list_set;
SQL :
-- Second attempt: order_id is cast to STRING before collect_set, which fixes
-- the earlier ClassCastException. NOTE(review): "udf" and "udaf" below are
-- placeholders for custom Java functions that are supposed to de-duplicate the
-- merged values — they are NOT Hive built-ins, so this statement does not run
-- as written. Per the Result shown below, concat_ws(collect_set(...)) alone
-- still yields duplicates across groups (e.g. "1,4,1,5"), because collect_set
-- only de-duplicates within each inner (first, second, third) group, not
-- across the concatenated comma-separated strings.
select first_level_directory,second_level_directory,udf(concat_ws(',',collect_set(set_list))) as set_list,udaf(udaf_set) as udaf_set from (
select first_level_directory,
second_level_directory,
third_level_directory ,
-- cast to STRING: concat_ws/collect_set over raw INTs raised the
-- "ArrayList cannot be cast to IntWritable" error in the first attempt
concat_ws(',',collect_set(cast(order_id as string))) set_list ,
udaf(order_id) udaf_set
from xxxx.test_collect_list_set
group by first_level_directory,second_level_directory,third_level_directory
) a
GROUP by first_level_directory,second_level_directory;
Result :
1 2 1,4,1,5 ["1,4","1,5"]
Tips:
I want to remove the duplicated values from the merged list.
The data should be transformed as follows:
1,4,1,5 ---> 1,4,5 (the order does not matter)
Maybe the best method is to write a UDF in Java to remove the duplicated values.
The other approach, supplied by my colleagues, is more complicated; for more information please refer to
https://blog.csdn.net/inte_sleeper/article/details/7196114
Another colleague gave me this SQL; it works fine on its own, but if we add more columns it raises errors:
-- De-duplicate a comma-separated list by splitting it into rows and
-- collecting the distinct values back into a set.
-- Written with LATERAL VIEW so that additional columns from the base table
-- can be selected alongside the exploded values; a bare explode() in the
-- SELECT list fails once other columns are added with:
--   "UDTF's are not supported outside the SELECT clause, nor nested in expressions"
SELECT collect_set(col) AS dedup_vals
FROM test
LATERAL VIEW explode(split('1,2,3,4,1,2', ',')) e AS col;
We could also use a UDAF to do this work; after discussing with my colleague, that seems easier for SQL developers.
errors like :
UDTF's are not supported outside the SELECT clause, nor nested in expressions
hive 在统计汇总时候,如何去重
最新推荐文章于 2024-02-15 09:44:22 发布