1.布隆过滤器
布隆过滤器:一种数据结构,是由一串很长的二进制向量组成,可以将其看成一个二进制数组。既然是二进制,那么里面存放的不是0,就是1,初始默认值都是0。
使用场景:在大数据量集合中,如何准确快速的判断某个数据是否存在,并且不占用大量内存。
比如,在一亿个手机号码中,判断某个手机号是否存在。
2.google guava 实现
<dependency>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
<version>28.0-jre</version>
</dependency>
@GetMapping("/test")
public void test(@RequestParam("total") int total,
@RequestParam(value = "fpp", required = false) String fpp) {
BloomFilter<String> bloomFilter;
if (fpp == null) {
// 默认误差是:0.03
bloomFilter = BloomFilter.create(Funnels.stringFunnel(Charsets.UTF_8), total);
} else {
bloomFilter = BloomFilter.create(Funnels.stringFunnel(Charsets.UTF_8), total, Float.parseFloat(fpp));
}
Set<String> set = new HashSet<>(1000);
List<String> list = new ArrayList<>(1000);
// 布隆过滤器添加total个数据,比如100w
for (int i = 0; i < total; i++) {
String uuid = UUID.randomUUID().toString();
if (i < 1000) {
set.add(uuid);
list.add(uuid);
}
bloomFilter.put(uuid);
}
// 模拟10w个数据,其中1000个是布隆过滤器已记录的
int number = total / 10;
int right = 0; // 布隆过滤器正确次数
int wrong = 0; // 布隆过滤器误判次数
for (int i = 0; i < number; i++) {
int index = number / 1000;
String str = i % (index) == 0 ? list.get(i / index) : UUID.randomUUID().toString();
if (bloomFilter.mightContain(str)) {
if (set.contains(str)) {
right++;
} else {
wrong++;
}
}
}
System.out.println("布隆过滤器容器大小:" + total);
System.out.println("模拟数据量:" + number);
System.out.println("正确次数:" + right);
System.out.println("误判次数:" + wrong);
NumberFormat numberFormat = NumberFormat.getInstance();
numberFormat.setMaximumFractionDigits(4);
System.out.println("误判率:" + numberFormat.format((float) wrong / (float) number));
}
http://localhost:8080/test?total=100000
http://localhost:8080/test?total=1000000
http://localhost:8080/test?total=100000&fpp=0.01
http://localhost:8080/test?total=1000000&fpp=0.01
3.自定义实现
package com.yzm.redis11.bloom;
public abstract class MyBloomFilter {
//用来生成不同的hash值,可以随便给,但别给奇数
protected final int[] ints = {8, 22, 32, 66, 88, 128};
//布隆过滤器中数据个数
protected Integer count = 0;
//数据量
public abstract Integer count();
//清空数据
public abstract void clear();
//添加数据
public abstract void push(Object key);
//判断key是否存在,true不一定说明key存在,但是false一定说明不存在
public abstract boolean contains(Object key);
//hash算法
public abstract int hash(Object key, int i);
}
package com.yzm.redis11.bloom;
import java.util.BitSet;
public class DefaultBloomFilter extends MyBloomFilter{
//bit集合,用来存放结果
private final int DEFAULT_SIZE = Integer.MAX_VALUE;
private final BitSet bitSet = new BitSet(DEFAULT_SIZE);
public DefaultBloomFilter() {
}
public Integer count() {
return count;
}
public void clear() {
bitSet.clear();
count = 0;
}
public void push(Object key) {
for (int i : ints) {
bitSet.set(hash(key, i));
}
count++;
}
public boolean contains(Object key) {
for (int i : ints) {
boolean exist = bitSet.get(hash(key, i));
if (!exist) return false;
}
return true;
}
public int hash(Object key, int i) {
int h;
int index = key == null ? 0 : (DEFAULT_SIZE - 1 - i) & ((h = key.hashCode()) ^ (h >>> 16));
//bitSet下标不能小于0
return index > 0 ? index : -index;
}
}
@GetMapping("/test2")
public void test2(@RequestParam("total") int total) {
MyBloomFilter bf = new DefaultBloomFilter();
bf.clear();
Set<String> set = new HashSet<>(1000);
List<String> list = new ArrayList<>(1000);
//向布隆过滤器中填充数据
for (int i = 0; i < total; i++) {
String uuid = UUID.randomUUID().toString();
if (i < 1000) {
set.add(uuid);
list.add(uuid);
}
bf.push(uuid);
}
int number = total / 10;
int right = 0; // 布隆过滤器正确次数
int wrong = 0; // 布隆过滤器误判次数
for (int i = 0; i < number; i++) {
int index = number / 1000;
String str = i % (index) == 0 ? list.get(i / index) : UUID.randomUUID().toString();
if (bf.contains(str)) {
if (set.contains(str)) {
right++;
} else {
wrong++;
}
}
}
System.out.println("布隆过滤器容器大小:" + total);
System.out.println("模拟数据量:" + number);
System.out.println("正确次数:" + right);
System.out.println("误判次数:" + wrong);
NumberFormat numberFormat = NumberFormat.getInstance();
numberFormat.setMaximumFractionDigits(4);
System.out.println("误判率:" + numberFormat.format((float) wrong / (float) number));
}
http://localhost:8080/test2?total=100000 (3次)
http://localhost:8080/test2?total=1000000 (3次)
package com.yzm.redis11.bloom;
import org.springframework.data.redis.core.StringRedisTemplate;
public class RedisBloomFilter extends MyBloomFilter {
private final String bloom = "y_bloom";
private final StringRedisTemplate stringRedisTemplate;
public RedisBloomFilter(StringRedisTemplate stringRedisTemplate) {
//向布隆过滤器中填充数据
this.stringRedisTemplate = stringRedisTemplate;
}
public Integer count() {
return count;
}
public void clear() {
stringRedisTemplate.delete(bloom);
count = 0;
}
public void push(Object key) {
for (int i : ints) {
stringRedisTemplate.opsForValue().setBit(bloom, hash(key, i), true);
}
count++;
}
public boolean contains(Object key) {
for (int i : ints) {
Boolean exist = stringRedisTemplate.opsForValue().getBit(bloom, hash(key, i));
if (exist != null && !exist) return false;
}
return true;
}
public int hash(Object key, int i) {
int h;
int index = key == null ? 0 : (Integer.MAX_VALUE - 1 - i) & ((h = key.hashCode()) ^ (h >>> 16));
// offset偏移量是正整数
return index > 0 ? index : -index;
}
}
MyBloomFilter bf;
Set<String> set;
List<String> list;
@GetMapping("/test3")
public void test3(@RequestParam("total") int total, boolean first) {
// 向redis添加数据比较久,就第一次添加,一般都是提前添加好并日常积累
if (first) {
bf = new RedisBloomFilter(stringRedisTemplate);
set = new HashSet<>(1000);
list = new ArrayList<>(1000);
bf.clear();
//向布隆过滤器中填充数据
for (int i = 0; i < total; i++) {
String uuid = UUID.randomUUID().toString();
if (i < 1000) {
set.add(uuid);
list.add(uuid);
}
bf.push(uuid);
}
}
int number = total / 10;
int right = 0; // 布隆过滤器正确次数
int wrong = 0; // 布隆过滤器误判次数
for (int i = 0; i < number; i++) {
int index = number / 1000;
String str = i % (index) == 0 ? list.get(i / index) : UUID.randomUUID().toString();
if (bf.contains(str)) {
if (set.contains(str)) {
right++;
} else {
wrong++;
}
}
}
System.out.println("布隆过滤器容器大小:" + total);
System.out.println("模拟数据量:" + number);
System.out.println("正确次数:" + right);
System.out.println("误判次数:" + wrong);
NumberFormat numberFormat = NumberFormat.getInstance();
numberFormat.setMaximumFractionDigits(4);
System.out.println("误判率:" + numberFormat.format((float) wrong / (float) number));
}
http://localhost:8080/test3?total=100000&first=true
http://localhost:8080/test3?total=100000&first=false (2次)
http://localhost:8080/test3?total=1000000&first=false (3次)
4. redisbloom 实现
首先需要在Redis服务器上安装 redisbloom
下载地址:https://gitcode.net/mirrors/RedisLabsModules/redisbloom/-/releases
wget https://github.com/RedisLabsModules/rebloom/archive/v2.2.9.tar.gz
tar -zxvf v2.2.9.tar.gz
cd RedisBloom-2.2.9/
make
看到:redisbloom.so
redis.conf 配置redisbloom.so路径
vim ./etc/redis.conf
loadmodule /usr/local/redis/RedisBloom-2.2.9/redisbloom.so
启动redis
./bin/redis-server ./etc/redis.conf
package com.yzm.redis11.bloom;
import org.springframework.data.redis.core.StringRedisTemplate;
import org.springframework.data.redis.core.script.DefaultRedisScript;
import org.springframework.data.redis.core.script.RedisScript;
import org.springframework.stereotype.Component;
import java.util.Arrays;
import java.util.List;
/**
* redis布隆过滤器
*/
@Component
public class RedisBloom {
private final StringRedisTemplate redisTemplate;
public RedisBloom(StringRedisTemplate redisTemplate) {
this.redisTemplate = redisTemplate;
}
private static RedisScript<Boolean> reserve = new DefaultRedisScript<>("return redis.call('bf.reserve', KEYS[1], ARGV[1], ARGV[2])", Boolean.class);
private static RedisScript<Boolean> add = new DefaultRedisScript<>("return redis.call('bf.add', KEYS[1], ARGV[1])", Boolean.class);
private static RedisScript<Boolean> exists = new DefaultRedisScript<>("return redis.call('bf.exists', KEYS[1], ARGV[1])", Boolean.class);
private static String mAdd = "return redis.call('bf.madd', KEYS[1], %s)";
private static String mExists = "return redis.call('bf.mexists', KEYS[1], %s)";
/**
* 设置错误率和大小(需要在添加元素前调用,若已存在元素,则会报错)
* 错误率越低,需要的空间越大
* bf.reserve key errorRate initialSize
*
* @param key
* @param errorRate 错误率,默认0.01
* @param initialSize 默认100,预计放入的元素数量,当实际数量超出这个数值时,误判率会上升,尽量估计一个准确数值再加上一定的冗余空间
*/
public Boolean reserve(String key, String errorRate, String initialSize) {
return redisTemplate.execute(reserve, Arrays.asList(key), errorRate, initialSize);
}
/**
* 添加元素
* bf.add key value
*
* @return true表示添加成功,false表示添加失败(存在时会返回false)
*/
public Boolean add(String key, String value) {
return redisTemplate.execute(add, Arrays.asList(key), value);
}
/**
* 查看元素是否存在(判断为存在时有可能是误判,不存在是一定不存在)
* bf.exists key value
*
* @return true表示存在,false表示不存在
*/
public Boolean exists(String key, String value) {
return redisTemplate.execute(exists, Arrays.asList(key), value);
}
/**
* 批量添加元素
* bf.madd key v1,v2,v3...
*
* @return 按序 1表示添加成功,0表示添加失败
*/
public List<Integer> mAdd(String key, String... values) {
return (List<Integer>) redisTemplate.execute(this.generateScript(mAdd, values), Arrays.asList(key), values);
}
/**
* 批量检查元素是否存在(判断为存在时有可能是误判,不存在是一定不存在)
* bf.mexists key v1,v2,v3...
*
* @return 按序 1表示存在,0表示不存在
*/
public List<Integer> mExists(String key, String... values) {
return (List<Integer>) redisTemplate.execute(this.generateScript(mExists, values), Arrays.asList(key), values);
}
private RedisScript<List> generateScript(String script, String[] values) {
StringBuilder sb = new StringBuilder();
for (int i = 1; i <= values.length; i++) {
if (i != 1) {
sb.append(",");
}
sb.append("ARGV[").append(i).append("]");
}
return new DefaultRedisScript<>(String.format(script, sb), List.class);
}
}
@GetMapping("/test4")
public void test4() {
String key = "y_test";
// 配置错误率和存储空间
System.err.println(redisBloom.reserve(key, "0.01", "10000"));
// 添加元素
System.err.println(redisBloom.add(key, "周星驰"));
// 判断元素是否存在
System.err.println(redisBloom.exists(key, "周星星"));
// 批量添加元素
System.err.println(redisBloom.mAdd(key, "张学友", "刘德华", "郭富城", "黎明"));
// 批量判断元素是否存在
System.err.println(redisBloom.mExists(key, "张学友", "刘德华", "郭德纲", "黎明"));
}
http://localhost:8080/test4
Set<String> set5;
List<String> list5;
@GetMapping("/test5")
public void test5(@RequestParam("total") int total, String rate, boolean first) {
String key = "y_test5";
if (first) {
redisBloom.reserve(key, rate, String.valueOf(total));
set5 = new HashSet<>(1000);
list5 = new ArrayList<>(1000);
//向布隆过滤器中填充数据
for (int i = 0; i < total; i++) {
String uuid = UUID.randomUUID().toString();
if (i < 1000) {
set5.add(uuid);
list5.add(uuid);
}
redisBloom.add(key, uuid);
}
}
int number = total / 10;
int right = 0; // 布隆过滤器正确次数
int wrong = 0; // 布隆过滤器误判次数
for (int i = 0; i < number; i++) {
int index = number / 1000;
String str = i % (index) == 0 ? list5.get(i / index) : UUID.randomUUID().toString();
if (redisBloom.exists(key, str)) {
if (set5.contains(str)) {
right++;
} else {
wrong++;
}
}
}
System.out.println("布隆过滤器容器大小:" + total);
System.out.println("模拟数据量:" + number);
System.out.println("正确次数:" + right);
System.out.println("误判次数:" + wrong);
NumberFormat numberFormat = NumberFormat.getInstance();
numberFormat.setMaximumFractionDigits(4);
System.out.println("误判率:" + numberFormat.format((float) wrong / (float) number));
}
http://localhost:8080/test5?total=100000&rate=0.05&first=true
http://localhost:8080/test5?total=100000&rate=0.05&first=false (2次)
http://localhost:8080/test5?total=1000000&rate=0.05&first=false (3次)
5.redisson 实现
<dependency>
<groupId>org.redisson</groupId>
<artifactId>redisson-spring-boot-starter</artifactId>
<version>3.15.3</version>
</dependency>
package com.yzm.redis11.config;
import org.redisson.Redisson;
import org.redisson.api.RedissonClient;
import org.redisson.config.Config;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
@Configuration
public class RedissonConfig {
/**
* 单机模式
*/
@Bean(name = "singleServer", destroyMethod = "shutdown")
public RedissonClient useSingleServer() {
Config config = new Config();
config.useSingleServer()
.setAddress("redis://192.168.8.128:6379")
.setPassword("1234")
.setDatabase(0);
return Redisson.create(config);
}
}
Set<String> set6;
List<String> list6;
@GetMapping("/test6")
public void test6(@RequestParam("total") int total, String rate, boolean first) {
RBloomFilter<String> bloomFilter = redissonClient.getBloomFilter("y_test6");
if (first) {
bloomFilter.tryInit(total, Double.parseDouble(rate));
set6 = new HashSet<>(1000);
list6 = new ArrayList<>(1000);
//向布隆过滤器中填充数据
for (int i = 0; i < total; i++) {
String uuid = UUID.randomUUID().toString();
if (i < 1000) {
set6.add(uuid);
list6.add(uuid);
}
bloomFilter.add(uuid);
}
}
int number = total / 10;
int right = 0; // 布隆过滤器正确次数
int wrong = 0; // 布隆过滤器误判次数
for (int i = 0; i < number; i++) {
int index = number / 1000;
String str = i % (index) == 0 ? list6.get(i / index) : UUID.randomUUID().toString();
if (bloomFilter.contains(str)) {
if (set6.contains(str)) {
right++;
} else {
wrong++;
}
}
}
System.out.println("布隆过滤器容器大小:" + total);
System.out.println("模拟数据量:" + number);
System.out.println("正确次数:" + right);
System.out.println("误判次数:" + wrong);
NumberFormat numberFormat = NumberFormat.getInstance();
numberFormat.setMaximumFractionDigits(4);
System.out.println("误判率:" + numberFormat.format((float) wrong / (float) number));
}
http://localhost:8080/test6?total=100000&rate=0.03&first=true
http://localhost:8080/test6?total=100000&rate=0.03&first=false (2次)
http://localhost:8080/test6?total=1000000&rate=0.03&first=false (3次)
改错误率
http://localhost:8080/test6?total=100000&rate=0.01&first=true
http://localhost:8080/test6?total=100000&rate=0.01&first=false (2次)
http://localhost:8080/test6?total=1000000&rate=0.01&first=false (3次)