大规模数据去重(布隆过滤器)的使用案例
大规模数据去重(布隆过滤器)的使用案例
用于对大批量数据进行去重判断。当面临海量数据时,根据业务条件进行去重,最终得到去重后的结果。
常用的Set在海量数据面前,占用的存储空间较大,读写效率也较低,故选择布隆过滤器作为优化方案。布隆过滤器的原理不做过多解释,这里只记录一次使用案例,供实战环境参考。
-
pom配置
<dependency>
    <groupId>com.google.guava</groupId>
    <artifactId>guava</artifactId>
    <version>30.0-jre</version>
</dependency>
-
案例实体类,及去重逻辑设置
/**
 * Sample entity for the Bloom-filter de-duplication demo.
 *
 * <p>The de-duplication key is exposed through {@link #toString()}: the token is used when it
 * is present, otherwise the phone number is used.
 */
public class UserDemo {

    private final String token;
    private final String phone;

    /**
     * @param token preferred de-duplication key; may be {@code null} or empty
     * @param phone fallback de-duplication key used when {@code token} is missing
     */
    public UserDemo(String token, String phone) {
        this.token = token;
        this.phone = phone;
    }

    /**
     * Static factory equivalent to {@code new UserDemo(token, phone)}.
     *
     * @param token preferred de-duplication key; may be {@code null} or empty
     * @param phone fallback de-duplication key used when {@code token} is missing
     * @return a new {@code UserDemo} holding the given values
     */
    public static UserDemo create(String token, String phone) {
        return new UserDemo(token, phone);
    }

    /**
     * Returns the de-duplication key as a string.
     *
     * @return {@code token} when it is neither {@code null} nor empty, otherwise {@code phone}
     *     (which is returned as-is, so the result is {@code null} when both are missing)
     */
    @Override
    public String toString() {
        // Plain null/empty check: avoids depending on a StringUtils class that the
        // declared dependencies (Guava only) do not provide.
        if (token == null || token.isEmpty()) {
            return phone;
        }
        return token;
    }
}
使用案例
// Number of test elements inserted into the filter.
private static final int insertions = 10000000;
// Elements with an index below this bound get a token; the remaining ones only have a phone.
private static final int limit1 = insertions / 2;

/**
 * Builds the de-duplication key of the {@code i}-th sample element.
 *
 * @param i index of the sample element
 * @return the token-based key for indexes below {@code limit1}, the phone-based key otherwise
 */
private static String sampleKey(int i) {
    if (i < limit1) {
        return UserDemo.create("ABCDEFGHIJKL" + i, "b" + i).toString();
    }
    return UserDemo.create(null, "b" + i).toString();
}

/**
 * Runs the Bloom-filter de-duplication demo.
 *
 * <p>Steps: fill a filter and a reference {@code HashSet} with {@code insertions} keys, write
 * the filter to a file, read it back into a second filter, probe that filter with
 * {@code insertions * 2} keys (the second half was never inserted) and print how many probes
 * were reported as present correctly and incorrectly, plus the elapsed time.
 *
 * @param args unused
 * @throws java.io.UncheckedIOException if the filter cannot be written to or read from disk
 */
public static void main(String[] args) {
    long start = System.currentTimeMillis();

    // 1. Initialise the filter. The charset is pinned to UTF-8 so that a persisted filter
    //    can be reloaded under a different platform default charset.
    Charset charset = java.nio.charset.StandardCharsets.UTF_8;
    // Capacity is twice the data volume; the target false-positive rate is 0.0001.
    BloomFilter<String> bloomFilter =
            BloomFilter.create(Funnels.stringFunnel(charset), insertions * 2, 0.0001);
    // Reference set used to tell true positives from false positives.
    Set<String> sets = new HashSet<>(insertions);

    // 2. Insert the data.
    for (int i = 0; i < insertions; i++) {
        String uid = sampleKey(i);
        bloomFilter.put(uid);
        sets.add(uid);
    }

    int right = 0;
    int wrong = 0;

    // 3. Persist the filter. The file lives in the platform temp directory instead of a
    //    hard-coded Windows drive, so the demo also runs on other operating systems.
    File store = new File(System.getProperty("java.io.tmpdir"), "bloom-filter-demo.bin");
    try (OutputStream out = new java.io.BufferedOutputStream(new FileOutputStream(store))) {
        bloomFilter.writeTo(out);
    } catch (IOException e) {
        // Fail fast: continuing without a persisted filter would make step 4 meaningless.
        throw new java.io.UncheckedIOException(
                "Failed to write Bloom filter to " + store.getAbsolutePath(), e);
    }

    // 4. Load the persisted data into a new filter (the same funnel must be supplied).
    BloomFilter<String> bloomFilter2;
    try (InputStream in = new java.io.BufferedInputStream(new FileInputStream(store))) {
        bloomFilter2 = BloomFilter.readFrom(in, Funnels.stringFunnel(charset));
    } catch (IOException e) {
        // Fail fast instead of leaving the filter null and hitting an NPE below.
        throw new java.io.UncheckedIOException(
                "Failed to read Bloom filter from " + store.getAbsolutePath(), e);
    }

    // 5. Probe the reloaded filter; indexes >= insertions were never inserted.
    int allNum = insertions * 2;
    for (int i = 0; i < allNum; i++) {
        String data = sampleKey(i);
        if (!bloomFilter2.mightContain(data)) {
            continue;
        }
        if (sets.contains(data)) {
            right++;
        } else {
            wrong++;
        }
    }

    // 6. Compute and print the result.
    NumberFormat percentFormat = NumberFormat.getPercentInstance();
    percentFormat.setMaximumFractionDigits(2);
    int absent = allNum - insertions;
    double percent = (double) wrong / absent;
    double bingo = (double) (absent - wrong) / absent;
    System.out.println("在" + allNum + "条数据中,判断 " + insertions + " 实际存在的元素,布隆过滤器认为存在的数量为:" + right);
    System.out.println("在" + allNum + "条数据中,判断" + insertions + "实际不存在的元素,布隆过滤器误认为存在的数量为:" + wrong);
    System.out.println("命中率为:" + percentFormat.format(bingo) + ",误判率为:" + percentFormat.format(percent));

    long end = System.currentTimeMillis();
    System.out.println("总计耗时" + (end - start) / 1000 + "秒");
}
-
1000万数据量模拟的执行结果