Java 通过布隆过滤器对 IP 地址进行过滤
add添加一条数据
/**
 * Adds one string to the filter.
 *
 * <p>First calls {@code checkNeedClear()}, then, while holding {@code lock},
 * sets one bit per seed at the index returned by {@code getHash}.
 *
 * @param data the string to record
 */
public void add(String data) {
    // Let the filter decide whether it has to be reset before inserting.
    checkNeedClear();
    lock.lock();
    try {
        // All bits belonging to one string are written under a single lock hold.
        for (int seed : seeds) {
            bitMap.set(getHash(data, seed));
        }
    } finally {
        lock.unlock();
    }
}
check查询一条数据
/**
 * Tests whether every bit derived from {@code data} is set.
 *
 * @param data the string to look up
 * @return {@code false} as soon as one of the probed bits is unset,
 *         {@code true} when all of them are set
 */
public boolean check(String data) {
    boolean allBitsSet = true;
    lock.lock();
    try {
        // Stop probing at the first unset bit.
        for (int k = 0; allBitsSet && k < seeds.length; k++) {
            allBitsSet = bitMap.get(getHash(data, seeds[k]));
        }
    } finally {
        lock.unlock();
    }
    return allBitsSet;
}
获取Hash值, 模仿字符串自带的hash值
/**
 * Polynomial hash of {@code data}: starts at 1 and, for each character,
 * multiplies the running value by {@code base}, adds the character and
 * reduces modulo {@code size}.
 *
 * @param data the string to hash
 * @param base the multiplier applied at every step
 * @return the final running value reduced modulo {@code size}
 */
public int getHash(String data, int base) {
    long acc = 1;
    for (int pos = 0, len = data.length(); pos < len; pos++) {
        acc = (acc * base + data.charAt(pos)) % size;
    }
    return (int) (acc % size);
}
主要项目结构
主要代码 JudgeRate枚举类
package com.cn.bl.enm;
/**
 * Preset seed sets for the Bloom filter. Each constant carries one seed per
 * hash function, so the constant chosen fixes how many hash functions are
 * applied to every string.
 *
 * <p>More hash functions (and correspondingly more bits per string) give a
 * lower false-positive rate. The figures below are the original author's rough
 * estimates, not measured guarantees:
 * <ul>
 *   <li>4 hash functions: about 14%</li>
 *   <li>8 hash functions: about 0.14^2, i.e. roughly 2%</li>
 *   <li>16 hash functions: about 0.14^4, i.e. roughly 0.04%</li>
 *   <li>32 hash functions: lower still (the author quotes about 0.002%)</li>
 * </ul>
 *
 * @author WangZhiHua
 * @since 2023-04-14
 */
public enum JudgeRate {
    /** Four seeds. Seeds are primes, the usual choice for a hash base. */
    BASE_4(new int[] { 2, 3, 5, 7 }),

    /** Eight seeds per string. */
    BASE_8(new int[] { 2, 3, 5, 7, 11, 13, 17, 19 }),

    /** Sixteen seeds per string. */
    BASE_16(new int[] { 2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53 }),

    /** Thirty-two seeds per string. */
    BASE_32(new int[] { 2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59, 61, 67, 71, 73, 79, 83, 89, 97,
            101, 103, 107, 109, 113, 127, 131 });

    private int[] seeds;

    private JudgeRate(int[] seeds) {
        this.seeds = seeds;
    }

    /**
     * Returns the seeds of this preset.
     *
     * @return a fresh copy of the seed array (or {@code null} if the seeds were
     *         set to {@code null}); modifying it does not affect this constant
     */
    public int[] getSeeds() {
        // Enum constants are shared JVM-wide, so never hand out the internal array.
        return seeds == null ? null : seeds.clone();
    }

    /**
     * Replaces the seeds of this preset.
     *
     * @param seeds the new seeds; the array is copied, so later changes made by
     *              the caller are not observed by this constant
     */
    public void setSeeds(int[] seeds) {
        this.seeds = seeds == null ? null : seeds.clone();
    }
}
BloomFilter类
package com.cn.bl.util;
import com.cn.bl.enm.JudgeRate;
import org.springframework.stereotype.Component;
import java.util.BitSet;
import java.util.concurrent.locks.ReentrantLock;
/**
 * Bloom filter over strings, backed by a {@link BitSet} with one polynomial
 * hash per seed of the configured {@link JudgeRate}.
 *
 * <p>The bit set holds {@code seeds.length * DATA_COUNT} bits. {@link #add},
 * {@link #check} and {@link #checkNeedClear} each hold the same
 * {@link ReentrantLock} while they touch the bit set. Once the fraction of set
 * bits reaches {@code AUTO_CLEAR_RATIO}, the next {@link #add} wipes the filter
 * before inserting.
 *
 * @author Wang
 * @since 2023-04-14 16:27
 */
@Component
public class BloomFilter {
    /** Seed preset; its length is the number of hash functions per string. */
    private static final JudgeRate RATE = JudgeRate.BASE_16;
    /** Number of items the filter is sized for (the test data volume). */
    private static final int DATA_COUNT = 100000;
    /** Fill ratio (set bits / total bits) at which the filter is cleared. */
    private static final double AUTO_CLEAR_RATIO = 0.95;

    /** Hash bases, one per hash function. */
    private final int[] seeds;
    /** The bit map. */
    private final BitSet bitMap;
    /** Number of bits in {@link #bitMap}; also the modulus used by {@link #getHash}. */
    private final int size;
    /** Guards {@link #bitMap} and {@link #setBitCount}. */
    private final ReentrantLock lock = new ReentrantLock();
    /**
     * Number of bits currently set in {@link #bitMap}. Maintained incrementally
     * so the fill check does not have to scan the whole bit set on every add.
     */
    private int setBitCount;

    /**
     * Builds an empty filter sized as {@code seeds.length * DATA_COUNT} bits.
     *
     * @throws RuntimeException if that bit count does not fit in an {@code int}
     */
    public BloomFilter() {
        seeds = RATE.getSeeds();
        // Widen before multiplying: an int * int product would wrap silently and
        // the range check below would never see the real value.
        long bitSize = (long) seeds.length * DATA_COUNT;
        if (bitSize < 0 || bitSize > Integer.MAX_VALUE) {
            throw new RuntimeException("位数太大溢出了,降低数据大小");
        }
        size = (int) bitSize;
        bitMap = new BitSet(size);
    }

    /**
     * Adds one string to the filter, clearing the filter first if it has
     * reached the auto-clear fill ratio.
     *
     * @param data the string to record
     */
    public void add(String data) {
        lock.lock();
        try {
            // The lock is reentrant, so the fill check and the insertion form one
            // atomic step instead of two separate lock acquisitions.
            checkNeedClear();
            for (int seed : seeds) {
                int hashIndex = getHash(data, seed);
                // Count only transitions from unset to set so the counter always
                // equals the bit set's cardinality.
                if (!bitMap.get(hashIndex)) {
                    bitMap.set(hashIndex);
                    setBitCount++;
                }
            }
        } finally {
            lock.unlock();
        }
    }

    /**
     * Tests whether every bit derived from {@code data} is set.
     *
     * @param data the string to look up
     * @return {@code false} if at least one probed bit is unset, meaning the
     *         string is not in the filter; {@code true} otherwise
     */
    public boolean check(String data) {
        lock.lock();
        try {
            for (int seed : seeds) {
                if (!bitMap.get(getHash(data, seed))) {
                    return false;
                }
            }
            return true;
        } finally {
            lock.unlock();
        }
    }

    /**
     * Clears the whole filter when the fraction of set bits has reached
     * {@code AUTO_CLEAR_RATIO}.
     */
    public void checkNeedClear() {
        lock.lock();
        try {
            if (1.0 * setBitCount / size >= AUTO_CLEAR_RATIO) {
                bitMap.clear();
                setBitCount = 0;
            }
        } finally {
            lock.unlock();
        }
    }

    /**
     * Polynomial hash of {@code data}: starts at 1 and, for each character,
     * multiplies the running value by {@code base}, adds the character and
     * reduces modulo the bit-set size.
     *
     * @param data the string to hash
     * @param base the multiplier applied at every step
     * @return an index into the bit set
     */
    public int getHash(String data, int base) {
        char[] value = data.toCharArray();
        // long keeps res * base + char from overflowing before the modulo.
        long res = 1;
        for (int i = 0; i < value.length; i++) {
            res = (res * base + value[i]) % size;
        }
        return (int) (res % size);
    }
}
测试类
package com.cn.bl;
import com.cn.bl.util.BloomFilter;
import org.junit.jupiter.api.Test;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.test.context.SpringBootTest;
import java.util.HashSet;
import java.util.Random;
@SpringBootTest
/**
 * Spring Boot test that compares the Bloom filter against a {@link HashSet}
 * for accuracy and lookup time on randomly generated IPv4 strings.
 */
class BlApplicationTests {
    /** Number of addresses inserted, and also the number of addresses probed. */
    private static final int SAMPLE_SIZE = 100000;

    @Autowired
    private BloomFilter bloomFilter;

    @Test
    void contextLoads() {
    }

    /**
     * Inserts {@code SAMPLE_SIZE} random addresses into both structures, probes
     * both with another {@code SAMPLE_SIZE} random addresses, and prints the
     * number of disagreements, the error rate and the lookup times.
     */
    @Test
    void testBoolFilterTime() {
        // Author's note (figures not verified here): de-duplicating 1,000,000
        // strings needs about 7 MB with the Bloom filter, and roughly 96 times
        // that with a HashSet.
        HashSet<String> set = new HashSet<>();
        Random r = new Random();
        // Addresses to insert, followed by addresses to probe with.
        String[] ipStr = randomIps(r, SAMPLE_SIZE);
        String[] testStr = randomIps(r, SAMPLE_SIZE);

        int res = 0;
        int fail = 0;
        for (int i = 0; i < SAMPLE_SIZE; i++) {
            bloomFilter.add(ipStr[i]);
            // res counts only the distinct strings actually inserted.
            if (set.add(ipStr[i])) {
                res++;
            }
        }
        System.out.println("插入有效字符串:" + res + "个");

        for (int i = 0; i < SAMPLE_SIZE; i++) {
            boolean check = bloomFilter.check(testStr[i]);
            boolean contains = set.contains(testStr[i]);
            // The HashSet is the ground truth; any disagreement is a filter error.
            if (contains != check) {
                System.out.println(testStr[i]);
                fail++;
            }
        }
        System.out.println("错了: " + fail + "个");
        // Mismatches are counted per probe, so the rate is normalised by the
        // number of probes rather than by the number of inserted strings.
        System.out.println("错误率: " + 1.0 * fail / SAMPLE_SIZE * 100 + "%");

        long st1 = System.currentTimeMillis();
        for (int i = 0; i < SAMPLE_SIZE; i++) {
            bloomFilter.check(testStr[i]);
        }
        long ed1 = System.currentTimeMillis();
        System.out.println("布隆过滤器比较" + SAMPLE_SIZE + "个ip用了: " + (ed1 - st1));

        long st2 = System.currentTimeMillis();
        for (int i = 0; i < SAMPLE_SIZE; i++) {
            set.contains(testStr[i]);
        }
        long ed2 = System.currentTimeMillis();
        System.out.println("哈希表比较" + SAMPLE_SIZE + "个ip用了: " + (ed2 - st2));
    }

    /** Generates {@code count} random dotted-quad strings with each octet in 0..255. */
    private static String[] randomIps(Random r, int count) {
        String[] ips = new String[count];
        for (int i = 0; i < count; i++) {
            ips[i] = randomIp(r);
        }
        return ips;
    }

    /** Builds one random dotted-quad string such as {@code "12.0.255.7"}. */
    private static String randomIp(Random r) {
        int a1 = r.nextInt(256);
        int a2 = r.nextInt(256);
        int a3 = r.nextInt(256);
        int a4 = r.nextInt(256);
        return "" + a1 + "." + a2 + "." + a3 + "." + a4;
    }
}
主要测试代码
主要效果图
发现十万个 IP 通过 16 个哈希函数错误率只有约 0.079%,空间缩小了约 100 倍
通过32个哈希函数,错误率接近于0
主要代码放在了gitee上
https://gitee.com/nclg/blossom-filter