我们经常需要对一些重复数据进行处理,删除重复的数据,只保留一份即可。重复数据的原因很多:
1.应用上事先没有建立有效的唯一约束或主键
2.建立了唯一约束或主键,但是失效了
这里记录一个高效的处理方式,使用建立约束时的exceptions into功能,简单描述如下:
--1.构造千万级数据,并插入一些重复数据,共有42条重复,保留一份,则保留21条即可
dingjun123@ORADB> create table t as select * from dba_objects;
Table created.
Elapsed: 00:00:00.29
dingjun123@ORADB> insert into t select * from (select * from t order by dbms_random.value) where rownum<10;
9 rows created.
Elapsed: 00:00:00.17
dingjun123@ORADB> insert into t select * from (select * from t order by dbms_random.value) where rownum<3;
2 rows created.
Elapsed: 00:00:00.11
dingjun123@ORADB> /
2 rows created.
Elapsed: 00:00:00.12
dingjun123@ORADB> /
2 rows created.
Elapsed: 00:00:00.11
dingjun123@ORADB> /
2 rows created.
Elapsed: 00:00:00.11
dingjun123@ORADB> /
2 rows created.
Elapsed: 00:00:00.12
dingjun123@ORADB> /
2 rows created.
Elapsed: 00:00:00.11
dingjun123@ORADB> commit;
Commit complete.
dingjun123@ORADB> insert into t select owner,object_name,subobject_name,(select max(object_id) from t)+rownum,data_object_id,object_type,created,LAST_DDL_TIME,TIMESTAMP,STATUS,
2 TEMPORARY,GENERATED,SECONDARY,NAMESPACE,EDITION_NAME from t;
75313 rows created.
Elapsed: 00:00:00.15
dingjun123@ORADB> /
150626 rows created.
Elapsed: 00:00:00.27
dingjun123@ORADB> /
301252 rows created.
Elapsed: 00:00:00.59
dingjun123@ORADB> /
602504 rows created.
Elapsed: 00:00:03.50
dingjun123@ORADB> /
1205008 rows created.
Elapsed: 00:00:09.65
dingjun123@ORADB> commit;
Commit complete.
Elapsed: 00:00:00.00
dingjun123@ORADB> insert into t select owner,object_name,subobject_name,(select max(object_id) from t)+rownum,data_object_id,object_type,created,LAST_DDL_TIME,TIMESTAMP,STATUS,
2 TEMPORARY,GENERATED,SECONDARY,NAMESPACE,EDITION_NAME from t;
2410016 rows created.
Elapsed: 00:00:14.41
dingjun123@ORADB> /
4820032 rows created.
Elapsed: 00:00:41.13
dingjun123@ORADB> commit;
Commit complete.
Elapsed: 00:00:00.00
--2.关键,建立exceptions表,然后建立唯一约束,有重复的rowid信息自动放入exceptions表里,也就是42条重复记录的row_id会放到exceptions 表里
dingjun123@ORADB> create table exceptions(row_id rowid,
2 owner varchar2(30),
3 table_name varchar2(30),
4 constraint varchar2(30));
Table created.
Elapsed: 00:00:00.01
--可以加nologging,parallel提高效率,查找重复记录非常快
dingjun123@ORADB> alter table t add constraint uk_t unique(object_id,object_name,owner)
2 exceptions into EXCEPTIONS NOLOGGING PARALLEL 4 ;
alter table t add constraint uk_t unique(object_id,object_name,owner)
*
ERROR at line 1:
ORA-02299: cannot validate (DINGJUN123.UK_T) - duplicate keys found
Elapsed: 00:00:53.53
dingjun123@ORADB> select count(*) from t;
COUNT(*)
----------
9640064
1 row selected.
Elapsed: 00:00:01.56
--获得重复记录
dingjun123@ORADB> select count(*) from exceptions;
COUNT(*)
----------
42
1 row selected.
Elapsed: 00:00:00.01
--3.插入重复记录到临时表dups中
dingjun123@ORADB> create table dups
2 as
3 select *
4 FROM t
5 where rowid in ( select row_id from exceptions );
Table created.
Elapsed: 00:00:00.07
--4.删除所有重复记录
dingjun123@ORADB> delete from t where rowid in ( select row_id from
2 exceptions );
42 rows deleted.
Elapsed: 00:00:00.02
--5.重复记录保留一份
dingjun123@ORADB> insert into t
SELECT owner,object_name,subobject_name,(select max(object_id) from t)+rownum,data_object_id,object_type,created,LAST_DDL_TIME,TIMESTAMP,STATUS,
TEMPORARY,GENERATED,SECONDARY,NAMESPACE,EDITION_NAME
FROM (
select owner,object_name,subobject_name,object_id,data_object_id,object_type,created,LAST_DDL_TIME,TIMESTAMP,STATUS,
TEMPORARY,GENERATED,SECONDARY,NAMESPACE,EDITION_NAME,
row_number() OVER(PARTITION BY object_id,object_name,owner ORDER BY ROWID) rn
from dups
)WHERE rn=1;
21 rows created.
Elapsed: 00:00:00.01
dingjun123@ORADB> commit;
Commit complete.
Elapsed: 00:00:00.01