背景:1000万手机号去重
方法:
1. 将手机号存入数据库表中,并设置唯一主键。
2. 用HashSet将访问过的手机号保存起来,这样只需接近O(1)的代价就可以查到一个手机号是否重复。
3. 手机号经过MD5或SHA-1等单向哈希后再保存到HashSet或数据库。
4. Bit-Map方法。建立一个BitSet,将手机号经过一个哈希函数映射到某一位。
其中
方法1比较耗IO,占用数据库资源
方法2,3比较占内存,很容易导致内存溢出
方法4消耗的内存相对较少,但缺点是单一哈希函数发生冲突的概率太高
所以我们采用BloomFilter(布隆过滤器)去重:它用多个哈希函数把每个手机号映射到BitSet中的多个位,好处是占用内存少、误判率低
代码
package com.focus.wechat.phone;
import java.util.BitSet;
/**
 * A minimal Bloom filter for strings backed by a {@link BitSet}.
 *
 * <p>Each value is mapped to one bit per seed in {@code SEEDS}; {@link #add(String)}
 * sets those bits and {@link #contains(String)} reports whether all of them are set.
 * Because different values can map to the same bits, {@code contains} may return
 * {@code true} for a value that was never added (false positive), but it never
 * returns {@code false} for a value that was added.
 *
 * <p>No synchronization is performed by this class.
 */
public class SimpleBloomFilter {

    /**
     * Number of bits in the filter: {@code 2 << 24} is 2^25. It must remain a
     * power of two because {@link SimpleHash#hash(String)} reduces the hash with
     * a {@code (cap - 1)} bit mask.
     */
    private static final int DEFAULT_SIZE = 2 << 24;

    /** Multipliers for the hash functions; one hash function is created per seed. */
    private static final int[] SEEDS = new int[] { 5, 7, 11, 13, 31, 37, 61 };

    private final BitSet bits = new BitSet(DEFAULT_SIZE);
    private final SimpleHash[] func = new SimpleHash[SEEDS.length];

    /** Creates an empty filter with one {@link SimpleHash} per seed. */
    public SimpleBloomFilter() {
        for (int i = 0; i < SEEDS.length; i++) {
            func[i] = new SimpleHash(DEFAULT_SIZE, SEEDS[i]);
        }
    }

    /**
     * Records {@code value} in the filter by setting the bit chosen by every hash function.
     *
     * @param value the value to record; {@code null} is ignored, which keeps this
     *              method consistent with {@link #contains(String)} returning
     *              {@code false} for {@code null}
     */
    public void add(String value) {
        if (value == null) {
            return;
        }
        for (SimpleHash f : func) {
            bits.set(f.hash(value), true);
        }
    }

    /**
     * Tests whether {@code value} may have been added.
     *
     * @param value the value to look up; may be {@code null}
     * @return {@code false} if {@code value} is {@code null} or at least one of its
     *         bits is clear; {@code true} if every bit is set
     */
    public boolean contains(String value) {
        if (value == null) {
            return false;
        }
        for (SimpleHash f : func) {
            // A single clear bit proves the value was never added; stop early.
            if (!bits.get(f.hash(value))) {
                return false;
            }
        }
        return true;
    }

    /**
     * Nested helper: a multiplicative string hash reduced to the range
     * {@code [0, cap)} with a bit mask.
     */
    public static class SimpleHash {
        private final int cap;
        private final int seed;

        /**
         * @param cap  size of the target range; the mask in {@link #hash(String)}
         *             only covers every index in {@code [0, cap)} when this is a
         *             power of two
         * @param seed multiplier applied to the running hash for each character
         */
        public SimpleHash(int cap, int seed) {
            this.cap = cap;
            this.seed = seed;
        }

        /**
         * Hashes {@code value} to a bit index.
         *
         * @param value the string to hash; must not be {@code null}
         * @return {@code (cap - 1) & h}, where {@code h} is built as
         *         {@code h = seed * h + char} over the characters of {@code value}
         */
        public int hash(String value) {
            int result = 0;
            int len = value.length();
            for (int i = 0; i < len; i++) {
                // int overflow is intentional here; the mask below keeps the index non-negative.
                result = seed * result + value.charAt(i);
            }
            return (cap - 1) & result;
        }
    }

    /** Small demo: prints the membership result before and after adding one value. */
    public static void main(String[] args) {
        String value = "stone2083@yahoo.cn";
        SimpleBloomFilter filter = new SimpleBloomFilter();
        System.out.println(filter.contains(value));
        filter.add(value);
        System.out.println(filter.contains(value));
    }
}
调用方法:
/**
 * Reads every document of the "order_sms_info" collection, and writes each
 * "phone" value that the Bloom filter has not seen yet to
 * D://phone//order//order1.txt, followed by LINE.
 *
 * <p>Free memory is printed before and after, and a progress counter is printed
 * every 10000 documents. Because {@code SimpleBloomFilter.contains} can report a
 * false positive, a phone number that was never written may occasionally be
 * treated as a duplicate and skipped.
 *
 * <p>NOTE(review): LINE and CopyOfMongoDBManager are declared outside this
 * block; LINE is presumably a line separator — confirm.
 */
public static void deal1(){
    System.out.println("开始内存:" + Runtime.getRuntime().freeMemory());
    SimpleBloomFilter filter = new SimpleBloomFilter();
    BufferedWriter writer = null;
    DBCursor cursor = null;
    try {
        writer = new BufferedWriter(new FileWriter("D://phone//order//order1.txt"));
        DBCollection collection = CopyOfMongoDBManager.getCollection("order_sms_info");
        cursor = collection.find();
        if (null != cursor) {
            int i = 0;
            while (cursor.hasNext()) {
                i++;
                if (i % 10000 == 0) {
                    System.out.println(i);
                }
                DBObject object = cursor.next();
                String phone = (String) object.get("phone");
                if (phone == null) {
                    // Documents without a phone value have nothing to de-duplicate or write.
                    continue;
                }
                if (!filter.contains(phone)) {
                    filter.add(phone);
                    writer.write(phone + LINE);
                }
            }
        }
        writer.flush();
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        // Release both resources on every path; the writer is still closed
        // even if closing the cursor throws.
        try {
            if (cursor != null) {
                cursor.close();
            }
        } finally {
            // writer stays null when the FileWriter constructor failed.
            if (writer != null) {
                try {
                    writer.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }
    System.out.println("结束内存:" + Runtime.getRuntime().freeMemory());
}