-- Test fixture: (re)create t_simhash_real and load 3,000,000 synthetic rows.
-- "if exists" keeps the script runnable on a database where the table has not
-- been created yet; a plain "drop table" raises an error in that case.
drop table if exists t_simhash_real;

create table t_simhash_real (
    bitstr_id      bigserial not null primary key,
    bitstr         character varying(256),
    doc_id         bigint,
    bit_count      integer,
    bit_count_half integer,
    thread_id      integer,
    grade          integer,
    update_time    integer
);

-- Index backing the update_time filter / ordering used by the deletes below.
-- The name is the one PostgreSQL would generate for an unnamed index here.
create index t_simhash_real_update_time_idx on t_simhash_real (update_time);

-- Seed data: bitstr is the md5 hex digest of a random number; every integer
-- column (including doc_id and update_time) receives the series value 1..3000000.
insert into t_simhash_real (
    bitstr,
    doc_id,
    bit_count,
    bit_count_half,
    thread_id,
    grade,
    update_time
)
select
    md5(random()::text) as bitstr,
    t as doc_id,
    t as bit_count,
    t as bit_count_half,
    t as thread_id,
    t as grade,
    t as update_time
from generate_series(1, 3000000) as t;
-- Original SQL from development: keep the 300000 rows with the largest
-- update_time and delete every other row.
-- NOTE(review): doc_id is nullable; if the subquery ever returns a NULL doc_id,
-- "not in" evaluates to unknown for every row and nothing is deleted.
delete from t_simhash_real
where doc_id not in (
    select doc_id
    from t_simhash_real
    order by update_time desc
    limit 300000
);

-- Optimized SQL: drop the "order by" and the subquery entirely and delete
-- directly by a time condition.
-- NOTE(review): this is an alternative to the statement above, not an exact
-- equivalent - it removes rows older than a fixed cutoff instead of keeping a
-- fixed number of newest rows. 1574928643 looks like a Unix epoch in seconds;
-- confirm the intended cutoff before running (with the 1..3000000 seed values
-- loaded above, every row is below it).
delete from t_simhash_real
where update_time < 1574928643;
-- Original SQL from development (2): a typical subquery-driven delete.
-- Author's note: both tables have to be fully scanned, so the bigger the
-- tables, the slower the delete.
-- The statements in this section are alternative formulations of the same
-- clean-up; run only one of the deletes.
delete from t_similarity_pair
where doc_id not in (
    select doc_id
    from t_simhash_real
);

-- Optimized approach: first use a join to find the rows that meet the delete
-- condition, then delete them; targeting the rows by their physical address
-- (ctid) makes the delete faster.

-- Step 1: fetch the rows that meet the delete condition (anti-join: rows of
-- t_similarity_pair with no matching doc_id in t_simhash_real).
select a.doc_id
from t_similarity_pair as a
left join t_simhash_real as b
    on a.doc_id = b.doc_id
where b.doc_id is null;

-- Author's note: better than the original, but still slow.
delete from t_similarity_pair
where doc_id in (
    select a.doc_id
    from t_similarity_pair as a
    left join t_simhash_real as b
        on a.doc_id = b.doc_id
    where b.doc_id is null
);

-- Author's note: faster - the matching rows are addressed directly by ctid.
-- NOTE(review): the variants differ on NULLs. The "not in" form deletes nothing
-- if t_simhash_real.doc_id contains a NULL, and this ctid form also deletes
-- t_similarity_pair rows whose own doc_id is NULL, which the two forms above
-- leave in place. The t_similarity_pair schema is not shown here - confirm
-- whether doc_id can be NULL.
delete from t_similarity_pair
where ctid = any (array(
    select a.ctid
    from t_similarity_pair as a
    left join t_simhash_real as b
        on a.doc_id = b.doc_id
    where b.doc_id is null
));