[sql] view plain copy
-- 环境:64位11.2G
-- 一、建立测试表,生成2000万测试数据,其中200万重复
CREATE TABLE test_t
(
id NUMBER (8) NOT NULL PRIMARY KEY,
name VARCHAR2 (32)
);
BEGIN
FOR i IN 1 .. 18000000
LOOP
INSERT INTO test_t (id, name)
VALUES (i, SYS_GUID ());
END LOOP;
COMMIT;
END;
/
INSERT INTO test_t (id, name)
SELECT 18000000 + (id / 9), name
FROM test_t
WHERE MOD (id, 9) = 0;
COMMIT;
CREATE INDEX idx_test_anme
ON test_t (name);
ANALYZE TABLE test_t COMPUTE STATISTICS FOR TABLE FOR ALL INDEXES FOR ALL INDEXED COLUMNS;
-- 二、两种去重方案对比执行时间
-- 方案1:建立中间表,生成辅助对象,删除原表,改表名
CREATE TABLE temp
AS
SELECT *
FROM test_t a
WHERE a.ROWID = (SELECT MIN (b.ROWID)
FROM test_t b
WHERE a.name = b.name);
-- 用时:08:14.79
CREATE INDEX idx
ON temp (name);
-- 用时:02:45.73
ALTER TABLE temp ADD CONSTRAINT temp_pk PRIMARY KEY (id);
-- 用时:01:15.79
-- 总用时:12分16秒
-- 方案2:直接删除原表中的重复数据
DELETE test_t a
WHERE a.ROWID > (SELECT MIN (b.ROWID)
FROM test_t b
WHERE a.name = b.name);
-- 用时:2个小时没出来,中断退出。
oralce 大数据去重
最新推荐文章于 2023-05-10 10:35:26 发布