1. 单字段重复
-
查询重复数据
-- 假设 ID 重复 SELECT ID FROM your_table GROUP BY ID HAVING COUNT(ID)>1;
-
去重,保留
ROWID
最小的数据DELETE FROM your_table a WHERE a.id IN ( SELECT ID FROM your_table GROUP BY ID HAVING COUNT( ID ) > 1 ) AND ROWID NOT IN (SELECT min( ROWID ) FROM your_table WHERE id IN GROUP BY id);
- 查询重复数据
-- 假设 ID, NAME, INFO SELECT ID,NAME,INFO,COUNT(*) FROM your_table GROUP BY ID,NAME,INFO HAVING COUNT(*)>1;
- 去重,保留
ROWID
最小的数据DELETE FROM your_table a WHERE (a.ID,a.NAME,a.INFO) IN ( SELECT ID,NAME,INFO FROM your_table GROUP BY ID,NAME,INFO HAVING COUNT(*) > 1 ) AND ROWID NOT IN (SELECT min(ROWID) FROM your_table GROUP BY ID,NAME,INFO HAVING COUNT(*) > 1 ) );
- 如果数据量达到千万级别或者亿级,如果使用
2
中的方法,速度超级慢,耗时超久。所以,建议使用DDL (Data Definition Language,就是操作表结构的语句)
方式,实测,前者约12h
未跑出,后者1h
内出结果。具体如下- 假设 表
students
中id
、name
和class
三列唯一确定一行数据,现在有数据总量3
亿,有重复数据,
要求:以上三列数据重复的,保留一条即可,这里,我们选择保留rowid
最小的一行数据-- 创建新表 students_t,并保存 students 中 rowid 最小的数据,这个数据就是去重后所需的数据 CREATE TABLE AS SELECT * FROM students_t WHERE ROWID IN (SELECT MIN(ROWID) FROM students GROUP BY ID,NAME,CLASS); -- 修改表名称,将students修改为 其它名称(如,stuents_1),以便于将表 stuents_t 名称修改为 students ALTER TABLE students RENAME TO students_1; ALTER TABLE students_t RENAME TO students;
- 如果数据库性能足够,可以加上并行,这样处理更快,比如
ALTER SESSION ENABLE PARALLEL DML; -- parallel(x, 16), x 表别名,16 并行数 CREATE TABLE students_t AS SELECT /*+parallel(x, 16)*/ * FROM students x WHERE ROWID IN (SELECT /*+parallel(x, 16)*/ MIN(ROWID) FROM students GROUP BY ID,NAME,CLASS); ALTER TABLE students RENAME TO students_1; ALTER TABLE students_t RENAME TO students;
- 假设 表