布谷鸟过滤器,一种增强版的布隆过滤器,不同于布隆过滤器的是,存放一段hash的地方会多个位置,用于增加空间率用率,布谷鸟过滤器会有两个hash,异或算法,两个hash能找到相互的位置,用于其中一个被布谷鸟的蛋T走时的容错。
/**
* 布谷鸟过滤器
* @Description: CuckooFilter
* @Author: GeorgeYang
* @Date: 2022-06-07
* @Version:v1.0
*/
public class CuckooFilter {
/**
* 预期处理的数据量大小
*/
private int dataCount;
/**
* 位数组的数量大小
*/
private int bitCount;
private int bitSpace;
/**
* 鸟巢位置2
* 位数组。数组中的元素只能是 0 或者 1
*/
private BitSet nestPosition1;
/**
* 鸟巢位置2
* 位数组。数组中的元素只能是 0 或者 1
*/
private BitSet nestPosition2;
/**
* 存放包含 hash 函数的类的数组
*/
private SliceHash[] func;
/**
* 静态内部类。hash 函数
* 块hash
*/
public static class SliceHash {
private int bitIndex;//bitIndex
private int fp;
private int intSliceSpace;//hash成整数,每段长多少
public SliceHash(int bitSpace, int bitIndex) {
this.bitIndex = bitIndex;
this.fp = bitSpace * bitIndex;
this.intSliceSpace = Integer.MAX_VALUE / bitSpace;
}
/**
* 计算 hash 值
*/
public int hash1(Object value) {
return Math.abs(bitIndex * intSliceSpace + (value.hashCode() << bitIndex) % intSliceSpace);
}
public int hash2(Object value) {
int hash1 = this.hash1(value);
int hash2 = hash1 ^ fp;
return Math.abs(hash2);
}
}
/**
*
* @param dataCount 预计存放总数量
* @param bitSpace 每个内容,可以被拆分分散在多少个位置里面选择存放,
*/
public CuckooFilter(int dataCount, int bitSpace) {
this.dataCount = dataCount;
this.bitSpace = bitSpace;
//每个字符分配32个位
this.bitCount = dataCount * bitSpace;
func = new SliceHash[bitSpace];//16个
for (int i = 0; i < bitSpace; i++) {
func[i] = new SliceHash(bitSpace, i);
}
this.nestPosition1 = new BitSet(bitCount);
this.nestPosition2 = new BitSet(bitCount);
}
public boolean add(Object value) {
if (contains(value))
return false;
for (SliceHash f : func) {
int hash1 = f.hash1(value);
nestPosition1.set(hash1, true);
nestPosition2.set(hash1, false);//T走同巢隔壁位置的蛋
int hash2 = f.hash2(value);
nestPosition1.set(hash2, false);//T走同巢隔壁位置的蛋
nestPosition2.set(hash2, true);
}
return true;
}
public boolean contains(Object value) {
for (SliceHash f : func) {
int hash1 = f.hash1(value);
int hash2 = f.hash2(value);
boolean inPosition1 = nestPosition1.get(hash1);
boolean inPosition2 = nestPosition2.get(hash2);
boolean existInOne = inPosition1 || inPosition2;
if (!existInOne)
//两个巢位都被T走了
return false;
}
return true;
}
public static void main(String[] args) throws Exception {
int dataCount = 2 << 24;
// int dataCount = 9000000;
System.out.println(dataCount);
CuckooFilter filter = new CuckooFilter(dataCount, 5);
int wrongCount = 0;
int addCount = 9999999;//1亿数据
for (int i = 0; i < addCount; i++) {
String value = "" + i;
boolean exist = filter.contains(value);
if (exist) {
wrongCount++;
}
boolean addSuccess = filter.add(value);
exist = filter.contains(value);
if (!exist)
throw new InterruptedException("添加后被认为不存在:" + value);
}
System.out.println("误判的数量:" + wrongCount);
System.out.println("误判率:" + (wrongCount * 100d / addCount) + "%");
int lostCount = 0;
int existCount = 0;
for (int i = 0; i < addCount; i++) {
String value = "" + i;
boolean exist = filter.contains(value);
if (exist) {
existCount++;
} else {
lostCount ++;
}
}
System.out.println("存留的数量:" + (existCount - wrongCount));
System.out.println("存留率:" + ((existCount - wrongCount) * 100d / addCount) + "%");
System.out.println("被T除的数量:" + lostCount);
System.out.println("丢失率:" + (lostCount * 100d / addCount) + "%");
}
}
测试添加1亿数据,运行结果:
误判的数量:876
误判率:0.008760000876000087%
存留的数量:9932122
存留率:99.32122993212299%
被T除的数量:67001
丢失率:0.6700100670010067%
还没有做扩容等操作,这里只是简单的实现。
参考:
https://zhuanlan.zhihu.com/p/462813998
https://blog.csdn.net/aaa_bbb_ccc_123_456/article/details/106055033