1.查询任务ID的数量,分组后取个数大于1 的即有重复的
select *
from (select a.task_group_id, count(1) lk
from hc_temp_1 a
group by a.task_group_id)
where lk > 1
2.用row_num 除重,取时间最早或最新的一条数据,order by 做条件判断
row_number() over(partition by 字段 order by 字段 desc) row_no
select hc.nets_cust_id, hc.updated_date
from (select a.nets_cust_id,
a.updated_date,
row_number() over(partition by a.vehicle_no order by a.updated_date desc) row_no
from hc_temp_1 A
WHERE A.VEHICLE_NO = '苏1-23123') hc
where hc.row_no = 1;
--扩展:若取的第一条数据不是唯一的一行,有多条数据,如年龄最大的人可能会有多个,要用 rank() over()
select a.nets_cust_id,
a.updated_date,
rank() over(partition by a.nets_cust_id order by a.updated_date desc) row_no
from hc_0829_1 a
–创建多个临时表可减少执行时间
create table hc_temp_2 as
select `/*+parallel(a,20)*/`
a.*,
row_number() over(partition by a.客户ID order by a.出单时间 desc) row_no
from hc_temp_1 a;
create table hc_temp_3 as
select `/*+parallel(a,20)*/`
*
from hc_temp_2 a
where row_no = 1;
3.CSV格式的文件限制106W,超过会提示文件加载未完全,可用row_number 拆分数据
create table hc_temp_2 as
select /*+parallel(a,20)*/
a.*, ' ' aaa
from hc_temp_1 a;
create table hc_temp_3 as
select /*+parallel(a,20)*/
a.*, row_number() over(partition by aaa order by aaa) row_no
from hc_temp_2 a;
select * from hc_temp_3 where row_no >= '1' and row_no < '900000';
select * from hc_temp_3 where row_no >= '900000' and row_no < '1800000';
4.having count(1) 来找出重复的数据再删除掉
create table hc_temp_2 as
select /*+parallel(a,20)*/
a.客户号
from hc_temp_1 a
group by a.客户号
having count(1) >= '2';
delete from hc_temp_1 a
where a.客户号 in (select 客户号 from hc_temp_2 );
commit;
———————————————————— 扩展补充 ————————————————————
rank() over() 和dense_rank() over() 的区别
rank()是跳跃排序,有两个第1名时接下来就是第3名
1
1
3
4
dense_rank() over()是连续排序,有两个第1名时仍然跟着第2名
1
1
2
3