网络爬虫之布隆过滤器

一.什么是布隆过滤器

    在内存中开辟一个长度为m的位数组B,初始状态下每一位的值都是0。然后选取k个不同的散列函数,对每个遍历过的URL分别执行散列,每次散列的结果都是一个小于m的非负整数a(0 ≤ a < m)。根据散列得到的数,把位数组B对应的位置1,也就是让B[a]=1。

    每次判断一个URL是否已经爬过时,同样对它执行k次散列,只有当对应的k个位全部已经置1时,才认为这个URL已经遍历过;只要有一位是0,就说明它一定没有出现过。由于不同URL的散列结果可能重合,布隆过滤器存在一定的误判率(把没爬过的URL判断为已爬过),但不会漏判。

 

二.代码实现(Java)

    BloomFilter.java

package com.rumo.foundation;

import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.Serializable;
import java.util.BitSet;
import java.util.concurrent.atomic.AtomicInteger;

/**
 * A Bloom filter over strings backed by a {@link BitSet}.
 *
 * <p>The bit array holds {@code seeds.length * dataCount} bits, where the seeds
 * come from the chosen {@link MisJudgeRate}. Each seed yields one bit position
 * per inserted string. A string is reported as present only when every one of
 * its positions is already set, so false positives are possible but false
 * negatives are not (until the filter is cleared).
 *
 * <p>Only the auto-clear step synchronizes on this object; all other reads and
 * writes of the bit set are unsynchronized, and {@link BitSet} itself is not
 * safe for concurrent use, so callers sharing one instance across threads
 * need external synchronization.
 */
public class BloomFilter implements Serializable{

	private static final long serialVersionUID = 923464509229483197L;
	/** Multipliers used to derive one bit position per seed (private copy). */
	private final int[] seeds;
	/** Number of bits in the filter; always positive. */
	private final int size;
	private final BitSet bitset;
	private final MisJudgeRate rate;
	/** Number of bits currently set to 1. */
	private final AtomicInteger useCount = new AtomicInteger(0);
	/** Usage ratio at which the filter is wiped, or null to never auto-clear. */
	private final Double autoClearRate;
	
	/**
	 * Creates a filter with {@link MisJudgeRate#MIDDLE} and no automatic clearing.
	 *
	 * @param dataCount expected number of entries; must be positive
	 * @throws IllegalArgumentException if {@code dataCount} is not positive
	 * @throws RuntimeException if the required bit count exceeds {@link Integer#MAX_VALUE}
	 */
	public BloomFilter(int dataCount){
		this(MisJudgeRate.MIDDLE, dataCount, null);
	}
	
	/**
	 * Creates a filter sized for the given rate and expected data volume.
	 *
	 * @param rate misjudgment-rate preset that supplies the seeds
	 * @param dataCount expected number of entries; must be positive
	 * @param autoClearRate
	 * 		usage ratio at which the filter wipes itself before the next insert,
	 * 		or null to never auto-clear. Once every bit is set the filter reports
	 * 		every input as present; pass e.g. 0.8 to wipe and start over when
	 * 		80% of the bits are in use.
	 * @throws IllegalArgumentException if {@code dataCount} is not positive
	 * @throws RuntimeException if the required bit count exceeds {@link Integer#MAX_VALUE}
	 */
	public BloomFilter(MisJudgeRate rate,int dataCount,Double autoClearRate){
		if(rate == null)
			throw new NullPointerException("rate");
		if(dataCount <= 0)
			throw new IllegalArgumentException("dataCount must be positive: " + dataCount);
		
		// Copy the seeds so later changes to the enum's array cannot alter
		// the hash positions of a filter that already holds data.
		int[] rateSeeds = rate.getSeeds().clone();
		// Widen BEFORE multiplying; an int product can wrap around to a
		// positive value and slip past the range check below.
		long bitSize = (long)rateSeeds.length * dataCount;
		if(bitSize <= 0 || bitSize > Integer.MAX_VALUE)
			throw new RuntimeException("DataCount too large,Overflow!");
		
		this.rate = rate;
		this.seeds = rateSeeds;
		this.size = (int)bitSize;
		this.bitset = new BitSet(size);
		this.autoClearRate = autoClearRate;
	}
	
	/**
	 * Records {@code data} in the filter and reports whether it appeared to be
	 * present beforehand.
	 *
	 * @param data the string to test and insert
	 * @return true if every bit for {@code data} was already set (seen before,
	 * 		or a false positive); false if at least one bit was newly set
	 */
	public boolean addIfNotExist(String data){
		checkNeedClear();// wipe first if the auto-clear threshold has been reached
		boolean exist = true;
		for (int i = 0; i < this.seeds.length; i++) {
			int index = hash(data, this.seeds[i]);
			if(!bitset.get(index)){
				// a single unset bit proves this is the first occurrence
				exist = false;
				setTrue(index);
			}
		}
		return exist;
	}
	
	/**
	 * Sets the bit at {@code index}. The usage counter only advances when the
	 * bit was previously clear, so it always equals the number of set bits.
	 *
	 * @param index bit position to set
	 */
	public void setTrue(int index){
		if(!bitset.get(index)){
			bitset.set(index,true);
			useCount.incrementAndGet();
		}
	}
	
	/**
	 * Wipes the filter when auto-clearing is enabled and the usage ratio has
	 * reached the configured threshold. The ratio is re-checked under the lock
	 * so that the wipe is skipped if another thread already performed it.
	 */
	private void checkNeedClear(){
		if(this.autoClearRate == null)
			return;
		if(getUseRate() >= this.autoClearRate){
			synchronized (this) {
				if(getUseRate() >= this.autoClearRate){
					bitset.clear();
					useCount.set(0);
				}
			}
		}
	}
	
	/**
	 * Maps {@code data} to a bit position in {@code [0, size)} for one seed.
	 *
	 * <p>NOTE(review): the accumulator is multiplied by the character position
	 * {@code i}, not by the seed, and the seed is only applied once at the end,
	 * so the positions produced for different seeds are strongly correlated.
	 * The arithmetic is deliberately left untouched because changing it would
	 * invalidate filters already persisted with {@link #saveFilter(String)}.
	 */
	private int hash(String data,int seeds){
		int hash = 0;
		for (int i = 0; i < data.length(); i++) {
			hash = i*hash + data.charAt(i);
		}
		// The remainder lies strictly between -size and size, so Math.abs
		// cannot hit the Integer.MIN_VALUE corner case here.
		hash = hash*seeds%this.size;
		return Math.abs(hash);
	}
	
	/**
	 * Records {@code data} in the filter without reporting prior presence.
	 *
	 * @param data the string to insert
	 */
	public void add(String data){
		checkNeedClear();
		for (int i = 0; i < this.seeds.length; i++) {
			setTrue(hash(data, this.seeds[i]));
		}
	}
	
	/**
	 * Tests whether {@code data} may have been added, without modifying the filter.
	 *
	 * @param data the string to look up
	 * @return false if {@code data} has definitely not been added since the
	 * 		last clear; true if all of its bits are set
	 */
	public boolean check(String data){
		for (int i = 0; i < this.seeds.length; i++) {
			if(!this.bitset.get(hash(data, this.seeds[i])))
				return false;
		}
		return true;
	}
	
	/**
	 * @return the counted number of set bits divided by the filter size
	 */
	public double getUseRate(){
		return (double)useCount.intValue() / (double)size;
	}
	
	/**
	 * Discards everything recorded in the filter.
	 */
	public void clear(){
		this.useCount.set(0);
		this.bitset.clear();
	}
	
	/**
	 * Serializes this filter to the file at {@code path}.
	 *
	 * @param path destination file
	 * @throws RuntimeException if writing fails; the original failure is
	 * 		attached as the cause
	 */
	public void saveFilter(String path){
		// try-with-resources closes the stream (and the file handle) even on failure
		try (ObjectOutputStream oos = 
				new ObjectOutputStream(new FileOutputStream(path))) {
			oos.writeObject(this);
		} catch (Exception e) {
			throw new RuntimeException("Save FilterObject to file Failure!", e);
		}
	}
	
	/**
	 * Reads back a filter previously written with {@link #saveFilter(String)}.
	 *
	 * <p>This uses Java native deserialization; only load files from a trusted source.
	 *
	 * @param path source file
	 * @return the deserialized filter
	 * @throws RuntimeException if reading fails; the original failure is
	 * 		attached as the cause
	 */
	public static BloomFilter readFilter(String path){
		try (ObjectInputStream ois = 
				new ObjectInputStream(new FileInputStream(path))) {
			return (BloomFilter) ois.readObject();
		} catch (Exception e) {
			throw new RuntimeException("Read Filter from ObjectFile Failure!", e);
		}
	}
	
	public MisJudgeRate getRate() {
		return rate;
	}

	/**
	 * Misjudgment-rate presets. Each preset carries one seed per bit allocated
	 * to every expected entry; the quoted rates are the figures given by the
	 * original author.
	 */
	public enum MisJudgeRate{
		
		/**
		 * 4 bits per entry, misjudgment rate about 0.14689159766308
		 */
		VERY_SMALL(new int[]{2,3,5,7}),
		
		/**
		 * 8 bits per entry, misjudgment rate about 0.02157714146322
		 */
		SMALL(new int[]{2,3,5,7,11,13,17,19}),
		
		/**
		 * 16 bits per entry, misjudgment rate about 0.00046557303372
		 */
		MIDDLE(new int[]{2,3,5,7,11,13,17,19,23,29,31,37,41,43,47,53}),
		
		/**
		 * 32 bits per entry, misjudgment rate about 0.00000021167340
		 */
		// NOTE(review): 193 breaks the otherwise consecutive prime sequence
		// (103 would be expected here). Left as-is because changing a seed
		// would invalidate HIGH filters that were already saved - confirm.
		HIGH(new int[]{2,3,5,7,11,13,17,19,23,29,31,37,41,43,47,53,
				59,61,67,71,73,79,83,89,97,101,193,107,109,113,127,131});
		
		
		private int[] seeds;
		
		private MisJudgeRate(int[] seeds){
			this.seeds = seeds;
		}
		
		/**
		 * @return the seed array held by this constant (not a copy)
		 */
		public int[] getSeeds(){
			return seeds;
		}
		
		/**
		 * Replaces the seeds of this constant. Enum constants are shared, so
		 * this affects every filter created with this rate afterwards.
		 */
		public void setSeeds(int[] seeds){
			this.seeds = seeds;
		}
		
	}
	
	/**
	 * Small demo: inserts a few URLs, prints the results and usage ratio, then
	 * saves the filter to disk and loads it back.
	 */
	public static void main(String[] args) {
		
		BloomFilter urlBloomFilter = new BloomFilter(5);
		System.out.println(urlBloomFilter.addIfNotExist("www.rumoss.cn"));
		System.out.println(urlBloomFilter.addIfNotExist("www.sohu.com"));
		System.out.println(urlBloomFilter.addIfNotExist("www.sina.com"));
		System.out.println(urlBloomFilter.getUseRate());
		System.out.println(urlBloomFilter.addIfNotExist("www.qq.cn"));
		System.out.println(urlBloomFilter.getUseRate());
		
		String savePath = "D:\\bloomFilter.obj";
		urlBloomFilter.saveFilter(savePath);
		
		urlBloomFilter = readFilter(savePath);
		System.out.println(urlBloomFilter.getUseRate());
		System.out.println(urlBloomFilter.addIfNotExist("www.qq.cn"));
		
	}
	
	
}

    测试结果(注意第三行的true是一次误判:www.sina.com此前并未加入过滤器,但它散列到的位已全部被前两个URL置1):

false
false
true
0.4
false
0.6
0.6
true

 

转载于:https://my.oschina.net/javamaster/blog/1610106

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值