// A simple implementation based on the paper: the exists and insert logic is implemented, the remove logic is not.
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.math.BigInteger;
import java.nio.charset.StandardCharsets;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.Arrays;
import java.util.Random;
/**
 * A simple cuckoo filter that supports membership tests and insertion; removal is not
 * implemented.
 *
 * <p>Every bucket holds {@link #SLOTS_PER_BUCKET} slots and every slot stores one fingerprint
 * that occupies a whole number of bytes (no bit packing). A slot whose bytes are all zero is
 * empty, therefore a stored fingerprint is never all zero.
 *
 * <p>Instances are not synchronized; callers sharing one instance between threads must
 * synchronize externally.
 *
 * <p><a href="https://www.docin.com/p-1444301256.html?docfrom=rrela">Reference paper</a>
 */
public class Filter {
    private static final Logger LOG = LoggerFactory.getLogger(Filter.class);

    /** Upper bound on evictions per insertion; the algorithm descriptions give no definitive value. */
    private static final int MAX_KICKS = 100;

    /** Number of slots per bucket (the parameter "b" of the cuckoo filter). */
    private static final int SLOTS_PER_BUCKET = 4;

    /** When the expected load of the table exceeds this ratio the table size is doubled. */
    private static final double MAX_LOAD_FACTOR = 0.90;

    /** Digest whose leading bytes are used as the fingerprint. */
    private static final String FINGERPRINT_ALGORITHM = "SHA-384";

    /** Digest used to derive bucket indexes. */
    private static final String INDEX_ALGORITHM = "SHA-256";

    private final Random random = new Random();
    /** Requested false-positive rate; determines the fingerprint length. */
    private final double expectedTolerance;
    /** Maximum number of elements the caller intends to store; determines the bucket count. */
    private final int maxComponentSize;
    /** Fingerprint length in bytes, which is also the size of one slot. */
    private final int fingerprintSize;
    /** Number of buckets; always a power of two so that XOR-ed indexes stay in range. */
    private final int bucketSize;
    /** bucket[i] holds the SLOTS_PER_BUCKET slots of bucket i, laid out back to back. */
    private final byte[][] bucket;

    /**
     * Creates an empty filter.
     *
     * @param expectedTolerance requested false-positive rate, strictly between 0 and 1
     * @param maxComponentSize  maximum number of elements to be stored, must be positive
     * @throws IllegalArgumentException if an argument is out of range or the required
     *                                  fingerprint is longer than the fingerprint digest
     * @throws NoSuchAlgorithmException if the fingerprint digest is not available
     */
    public Filter(double expectedTolerance, int maxComponentSize) throws NoSuchAlgorithmException {
        // Explicit checks instead of "assert": assertions are disabled unless the JVM runs with -ea.
        if (maxComponentSize <= 0) {
            throw new IllegalArgumentException("maxComponentSize must be positive: " + maxComponentSize);
        }
        // Written as a negated conjunction so that NaN is rejected as well.
        if (!(expectedTolerance > 0.0 && expectedTolerance < 1.0)) {
            throw new IllegalArgumentException(
                    "expectedTolerance must be in the open interval (0, 1): " + expectedTolerance);
        }
        this.expectedTolerance = expectedTolerance;
        this.maxComponentSize = maxComponentSize;

        // 1. Fingerprint length in bits: f >= log2(2b / tolerance).
        //    Java has no log2, so the change-of-base formula ln(a) / ln(2) is used.
        int minFingerprintBits =
                (int) Math.ceil(Math.log(2 * SLOTS_PER_BUCKET / expectedTolerance) / Math.log(2));
        // The fingerprint is a prefix of the digest (like abbreviated git hashes). The birthday
        // bound p ~= n^2 / (2m) is deliberately not enforced on top of f: an element is located
        // through two hashes and the author measured that the extra widening was not needed.
        int maxFingerprintBits = MessageDigest.getInstance(FINGERPRINT_ALGORITHM).getDigestLength() * 8;
        if (minFingerprintBits > maxFingerprintBits) {
            throw new IllegalArgumentException("指纹算法支持的最大长度为:" + maxFingerprintBits
                    + "(bit),但实际的指纹长度为" + minFingerprintBits + "(bit),请更换指纹算法");
        }

        // 2. Bucket count: the smallest power of two strictly greater than ceil(n / b),
        //    doubled once more when the resulting load would exceed MAX_LOAD_FACTOR.
        int expectedBucketSize = (int) Math.ceil(maxComponentSize / (double) SLOTS_PER_BUCKET);
        int buckets = Integer.highestOneBit(expectedBucketSize) << 1;
        if (maxComponentSize / (buckets * (double) SLOTS_PER_BUCKET) > MAX_LOAD_FACTOR) {
            buckets <<= 1;
        }
        this.bucketSize = buckets;

        // 3. Slot layout: each slot is rounded up to whole bytes to keep the addressing simple.
        this.fingerprintSize = (minFingerprintBits + 7) / 8;
        int fingerprintBits = this.fingerprintSize * 8;
        // Actual false-positive rate is approximately 2b / 2^f.
        double realTolerance = 2.0 * SLOTS_PER_BUCKET / Math.pow(2, fingerprintBits);
        int bytesPerBucket = this.fingerprintSize * SLOTS_PER_BUCKET;
        int bytesPerBucketCompressed = (int) Math.ceil(minFingerprintBits * SLOTS_PER_BUCKET / 8.0);
        this.bucket = new byte[this.bucketSize][bytesPerBucket];

        // Sizes are computed in double so that bucketSize * bytesPerBucket cannot overflow int.
        double megabytes = (double) this.bucketSize * bytesPerBucket / 1024 / 1024;
        double megabytesCompressed = (double) this.bucketSize * bytesPerBucketCompressed / 1024 / 1024;
        LOG.info("\n最大元素个数=[{}],期望错误率=[{}],真实误差率=[{}],\n" +
                        "bucketSize=[{}],初始指纹长度=[{}]bit,扩展后的指纹长度=[{}]bit\n" +
                        "非紧凑算法下,bucket[{}][{}],共消耗{}MB内存\n" +
                        "紧凑算法下,bucket[{}][{}],共消耗{}MB内存",
                this.maxComponentSize, this.expectedTolerance, realTolerance,
                this.bucketSize, minFingerprintBits, fingerprintBits,
                this.bucketSize, bytesPerBucket, megabytes,
                this.bucketSize, bytesPerBucketCompressed, megabytesCompressed
        );
    }

    /**
     * Computes the fingerprint of an element: the leading {@code fingerprintSize} bytes of its
     * digest (big-endian prefix, as git does for abbreviated hashes).
     *
     * @param component element to fingerprint
     * @return a fingerprint of exactly {@code fingerprintSize} bytes that is never all zero
     * @throws NoSuchAlgorithmException if the fingerprint digest is not available
     */
    private byte[] computeFingerprint(String component) throws NoSuchAlgorithmException {
        byte[] digest = MessageDigest.getInstance(FINGERPRINT_ALGORITHM)
                .digest(component.getBytes(StandardCharsets.UTF_8));
        byte[] fingerprint = Arrays.copyOf(digest, this.fingerprintSize);
        // All-zero bytes mark an empty slot, so an all-zero fingerprint would be invisible to
        // findFreeSlot and would match every empty slot. Remap it to a fixed non-zero value.
        if (this.isEmpty(fingerprint, 0)) {
            fingerprint[fingerprint.length - 1] = 1;
        }
        return fingerprint;
    }

    /**
     * Maps arbitrary bytes to a bucket index.
     *
     * <p>The result equals the digest, read as a big-endian two's-complement number, AND-ed with
     * {@code bucketSize - 1}; because the mask fits in 31 bits only the last four digest bytes
     * contribute.
     *
     * @param originBytes bytes to hash
     * @return an index in {@code [0, bucketSize)}
     * @throws NoSuchAlgorithmException if the index digest is not available
     */
    private int hash(byte[] originBytes) throws NoSuchAlgorithmException {
        byte[] digest = MessageDigest.getInstance(INDEX_ALGORITHM).digest(originBytes);
        int low32 = this.bytesToInt(
                Arrays.copyOfRange(digest, digest.length - Integer.BYTES, digest.length));
        return low32 & (this.bucketSize - 1);
    }

    /**
     * Interprets one to four bytes as an unsigned big-endian number packed into an int.
     *
     * @param bytes big-endian bytes, possibly fewer than four
     * @return the packed value; negative only when four bytes are given and the top bit is set
     * @throws IllegalArgumentException if {@code bytes} is null, empty or longer than four bytes
     */
    private int bytesToInt(byte[] bytes) {
        if (bytes == null || bytes.length == 0 || bytes.length > 4) {
            throw new IllegalArgumentException("无法正常转换成int类型的数字");
        }
        int result = 0;
        for (byte b : bytes) {
            // Mask to avoid sign extension of bytes >= 0x80.
            result = (result << 8) | (b & 0xFF);
        }
        return result;
    }

    /** Returns true when the {@code fingerprintSize} bytes of {@code data} at {@code offset} are all zero. */
    private boolean isEmpty(byte[] data, int offset) {
        for (int i = 0; i < this.fingerprintSize; i++) {
            if (data[offset + i] != 0) {
                return false;
            }
        }
        return true;
    }

    /**
     * Tells whether any slot of a bucket stores the given fingerprint.
     *
     * @param fingerprint   fingerprint to look for
     * @param slotsForIndex raw bytes of one bucket
     * @return true if one of the slots equals the fingerprint byte for byte
     */
    private boolean compareSlots(byte[] fingerprint, byte[] slotsForIndex) {
        for (int slot = 0; slot < SLOTS_PER_BUCKET; slot++) {
            int offset = slot * this.fingerprintSize;
            boolean matches = true;
            for (int i = 0; i < this.fingerprintSize && matches; i++) {
                matches = fingerprint[i] == slotsForIndex[offset + i];
            }
            if (matches) {
                return true;
            }
        }
        return false;
    }

    /**
     * Tests whether an element may have been inserted.
     *
     * @param component element to look up
     * @return false if the element was definitely not inserted; true if it was inserted or
     *         collides with an inserted element (false positive)
     * @throws NoSuchAlgorithmException if a required digest is not available
     */
    public boolean compnentExists(String component) throws NoSuchAlgorithmException {
        byte[] fingerprint = this.computeFingerprint(component);
        int index1 = this.hash(component.getBytes(StandardCharsets.UTF_8));
        // Partial-key cuckoo hashing: the alternate bucket depends only on the fingerprint.
        int index2 = index1 ^ this.hash(fingerprint);
        return this.compnentExists(fingerprint, index1, index2);
    }

    /** Looks for the fingerprint in its two candidate buckets. */
    private boolean compnentExists(byte[] fingerprint, int index1, int index2) {
        return this.compareSlots(fingerprint, this.bucket[index1])
                || this.compareSlots(fingerprint, this.bucket[index2]);
    }

    /**
     * Inserts an element. Nothing is stored when an equal fingerprint is already present in one
     * of the two candidate buckets.
     *
     * <p>When both candidate buckets are full, stored fingerprints are evicted to their alternate
     * buckets, at most {@link #MAX_KICKS} times. If that does not free a slot, every eviction is
     * undone so the filter keeps exactly its previous content, and an exception is thrown.
     *
     * @param component element to insert
     * @throws IllegalStateException    if no slot could be freed within {@link #MAX_KICKS} evictions
     * @throws NoSuchAlgorithmException if a required digest is not available
     */
    public void insertCompnent(String component) throws NoSuchAlgorithmException {
        byte[] fingerprint = this.computeFingerprint(component);
        int index1 = this.hash(component.getBytes(StandardCharsets.UTF_8));
        int index2 = index1 ^ this.hash(fingerprint);
        if (this.compnentExists(fingerprint, index1, index2)) {
            LOG.debug("元素已经存在");
            return;
        }
        if (this.tryStore(index1, fingerprint) || this.tryStore(index2, fingerprint)) {
            return;
        }

        // Both buckets are full: start evicting from a randomly chosen candidate bucket.
        int[] kickedBuckets = new int[MAX_KICKS];
        int[] kickedSlots = new int[MAX_KICKS];
        int current = this.random.nextBoolean() ? index1 : index2;
        for (int kick = 0; kick < MAX_KICKS; kick++) {
            int victim = this.random.nextInt(SLOTS_PER_BUCKET);
            // After the swap "fingerprint" carries the evicted fingerprint.
            this.swapWithSlot(current, victim, fingerprint);
            kickedBuckets[kick] = current;
            kickedSlots[kick] = victim;
            if (LOG.isDebugEnabled()) {
                LOG.debug("第[{}]次踢出操作,踢出的位置[{}][{}],被踢出的指纹=[{}]"
                        , kick + 1, current, victim, Arrays.toString(fingerprint));
            }
            // Alternate bucket of the evicted fingerprint.
            current = this.hash(fingerprint) ^ current;
            if (this.tryStore(current, fingerprint)) {
                return;
            }
        }

        // Undo the swaps in reverse order; otherwise the fingerprint evicted last would be lost
        // and a previously inserted element would turn into a false negative.
        for (int kick = MAX_KICKS - 1; kick >= 0; kick--) {
            this.swapWithSlot(kickedBuckets[kick], kickedSlots[kick], fingerprint);
        }
        throw new IllegalStateException("踢出次数超出上限");
    }

    /**
     * Stores the fingerprint in the first empty slot of a bucket.
     *
     * @return true if a slot was available, false if the bucket is full
     */
    private boolean tryStore(int bucketIndex, byte[] fingerprint) {
        int offset = this.findFreeSlot(bucketIndex);
        if (offset == -1) {
            return false;
        }
        System.arraycopy(fingerprint, 0, this.bucket[bucketIndex], offset, this.fingerprintSize);
        return true;
    }

    /** Exchanges the content of {@code carried} with the given slot of the given bucket. */
    private void swapWithSlot(int bucketIndex, int slot, byte[] carried) {
        byte[] slots = this.bucket[bucketIndex];
        int offset = slot * this.fingerprintSize;
        for (int i = 0; i < this.fingerprintSize; i++) {
            byte evicted = slots[offset + i];
            slots[offset + i] = carried[i];
            carried[i] = evicted;
        }
    }

    /**
     * Finds an empty slot in a bucket. A slot is empty when all of its bytes are zero.
     *
     * @param bucketIndex index of the bucket to scan
     * @return byte offset of the first empty slot inside the bucket, or -1 if the bucket is full
     */
    private int findFreeSlot(int bucketIndex) {
        byte[] slots = this.bucket[bucketIndex];
        for (int slot = 0; slot < SLOTS_PER_BUCKET; slot++) {
            int offset = slot * this.fingerprintSize;
            if (this.isEmpty(slots, offset)) {
                return offset;
            }
        }
        return -1;
    }

    /**
     * Smoke test: inserts one million decimal strings into a filter sized for one million
     * elements at a false-positive rate of 1e-5, then prints the elapsed time and one lookup.
     */
    public static void main(String[] args) throws NoSuchAlgorithmException {
        Filter filter = new Filter(0.00001, 1_000_000);
        long startTime = System.currentTimeMillis();
        int i = 0;
        try {
            for (; i < 1_000_000; i++) {
                String str = i + "";
                filter.insertCompnent(str);
            }
        } finally {
            // Printed even when an insertion fails, to show how far the loop got.
            System.out.println("i=" + i);
        }
        System.out.println("耗时:" + (System.currentTimeMillis() - startTime + " ms"));
        System.out.println(filter.compnentExists("19999"));
    }
}