差不多3亿6千万数据,需要去重。因为数据量太大,所以:
将数据load data infile到大表里,不进行任何去重操作,没有任何约束。然后将数据分成几十个小表,用这几十个小表去对比大表去重。得到去重后的小表。去重以后的小表,根据字段进行hash算出后两位数字,重新建好新表,将去重后小表的数据,插入到带有hash数字新表中。
存储过程如下(去重):
DELIMITER //
/*tblname 动态控制表名*/
CREATE PROCEDURE create_imsi(IN tblname varchar(200))
begin
declare age int default 1;
declare done int(1) default 0;
declare v_imsi varchar(200);
/*定义游标*/
declare cur_l cursor for select imsi from sqlstr;
/*定义异常*/
DECLARE CONTINUE HANDLER FOR SQLSTATE '02000' set done=1;
drop view if exists sqlstr;
/*定义视图*/
set @tbl = CONCAT("create view sqlstr as select a.imsi from tbl_new a,(select imsi from phone_",tblname," group by imsi having count(imsi) > 1) b where a.imsi = b.imsi group by imsi"