场景:
- 字处理软件中,需要检查一个英语单词是否拼写正确
- 在 FBI,一个嫌疑人的名字是否已经在嫌疑名单上
- 在网络爬虫里,一个网址是否被访问过
- yahoo, gmail等邮箱垃圾邮件过滤功能
涉及到爬虫,肯定会涉及到去重问题,:海量数据查找元素是否存在。
布隆过滤原理:https://www.cnblogs.com/cpselvis/p/6265825.html
import java.util.BitSet;
/*
简单的Bloom过滤器实现
*/
public class BloomDemo {
private static final int SIZE = 1 << 24;
BitSet bitSet = new BitSet(SIZE);
Hash[] hashs = new Hash[8];
private static final int seeds[] = new int[] { 3, 5, 7, 9, 11, 13, 17, 19 };
public static void main(String[] args) {
System.out.println(SIZE);
String email = "https://blog.csdn.net/qq_40374604/article/details/88974732";
BloomDemo bloomDemo = new BloomDemo();
System.out.println(email + "是否在列表中: " + bloomDemo.contains(email));
bloomDemo.add(email);
System.out.println(email + "是否在列表中: " + bloomDemo.contains(email));
email = "https://www.imsilkroad.com/news/category/siluyiliao";
System.out.println(email + "是否在列表中: " + bloomDemo.contains(email));
}
public BloomDemo() {
for (int i = 0; i < seeds.length; i++) {
hashs[i] = new Hash(seeds[i]);
}
}
public void add(String string) {
for (Hash hash : hashs) {
System.out.println(hash.getHash(string));
bitSet.set(hash.getHash(string), true);
}
}
public boolean contains(String string) {
boolean have = true;
for (Hash hash : hashs) {
have &= bitSet.get(hash.getHash(string));
}
return have;
}
class Hash {
private int seed = 0;
public Hash(int seed) {
this.seed = seed;
}
public int getHash(String string) {
int val = 0;
int len = string.length();
for (int i = 0; i < len; i++) {
val = val * seed + string.charAt(i);
}
return val & (SIZE - 1);
}
}
}
有什么想法留言区留言