如何快速判断一个数据是否存在于亿级数据中

最新推荐文章于 2021-02-02 04:37:51 发布

旧人可安

最新推荐文章于 2021-02-02 04:37:51 发布

阅读量396

点赞数

分类专栏：面试题

本文链接：https://blog.csdn.net/weixin_41325595/article/details/103505326

版权

面试题专栏收录该内容

6 篇文章 0 订阅

订阅专栏

package com.haowu.testBloomFilter;

/**
 * A minimal Bloom filter for {@link String} keys.
 *
 * <p>Each key is hashed with three different hash functions; every hash selects one slot of
 * a fixed-size {@code int[]} which is set to 1 on {@link #add(String)}. {@link #check(String)}
 * reports a key as present only when all three of its slots are set, so a {@code false}
 * answer means the key was never added, while a {@code true} answer may be a false positive.
 * Slots are never cleared, so keys cannot be removed.
 */
public class BloomFilters {
    /**
     * Number of slots in the backing array.
     */
    private final int arraySize;
    /**
     * Backing array: a slot is 1 once some added key hashed to it, 0 otherwise.
     */
    private final int[] array;

    /**
     * Creates an empty filter.
     *
     * @param arraySize number of slots; must be positive
     * @throws IllegalArgumentException if {@code arraySize} is zero or negative
     */
    public BloomFilters(int arraySize) {
        if (arraySize <= 0) {
            throw new IllegalArgumentException("arraySize must be positive: " + arraySize);
        }
        this.arraySize = arraySize;
        this.array = new int[arraySize];
    }

    /**
     * Records a key in the filter by setting the three slots its hashes select.
     *
     * @param key the key to add; must not be {@code null}
     * @throws NullPointerException if {@code key} is {@code null}
     */
    public void add(String key) {
        array[indexFor(hashcode_1(key))] = 1;
        array[indexFor(hashcode_2(key))] = 1;
        array[indexFor(hashcode_3(key))] = 1;
    }

    /**
     * Tests whether a key may have been added.
     *
     * @param key the key to look up; must not be {@code null}
     * @return {@code false} if the key was definitely never added; {@code true} if all three
     *         of its slots are set (the key was added, or other keys happen to cover its slots)
     * @throws NullPointerException if {@code key} is {@code null}
     */
    public boolean check(String key) {
        return array[indexFor(hashcode_1(key))] != 0
                && array[indexFor(hashcode_2(key))] != 0
                && array[indexFor(hashcode_3(key))] != 0;
    }

    /**
     * Maps a hash value to a valid slot index.
     *
     * <p>{@code Math.abs(Integer.MIN_VALUE)} is still negative, so the hash functions below can
     * return a negative value and a plain {@code %} would then yield a negative index.
     * {@code floorMod} always returns a value in {@code [0, arraySize)} and equals {@code %}
     * for every non-negative hash, so all other keys keep the same slots.
     *
     * @param hash value produced by one of the hash functions
     * @return index in {@code [0, arraySize)}
     */
    private int indexFor(int hash) {
        return Math.floorMod(hash, arraySize);
    }

    /**
     * Hash function 1: multiply-by-33 accumulation over the characters of the key.
     *
     * @param key the key to hash
     * @return {@code Math.abs} of the accumulated value (negative only for
     *         {@code Integer.MIN_VALUE})
     */
    private int hashcode_1(String key) {
        int hash = 0;
        for (int i = 0; i < key.length(); ++i) {
            hash = 33 * hash + key.charAt(i);
        }
        return Math.abs(hash);
    }

    /**
     * Hash function 2: xor-then-multiply over the characters using the 32-bit FNV prime and
     * offset basis, followed by extra shift/xor mixing steps.
     *
     * @param data the key to hash
     * @return {@code Math.abs} of the mixed value (negative only for
     *         {@code Integer.MIN_VALUE})
     */
    private int hashcode_2(String data) {
        final int p = 16777619;
        int hash = (int) 2166136261L;
        for (int i = 0; i < data.length(); i++) {
            hash = (hash ^ data.charAt(i)) * p;
        }
        hash += hash << 13;
        hash ^= hash >> 7;
        hash += hash << 3;
        hash ^= hash >> 17;
        hash += hash << 5;
        return Math.abs(hash);
    }

    /**
     * Hash function 3: per-character add/shift/xor mixing with a final avalanche step.
     *
     * @param key the key to hash
     * @return {@code Math.abs} of the mixed value (negative only for
     *         {@code Integer.MIN_VALUE})
     */
    private int hashcode_3(String key) {
        int hash = 0;
        for (int i = 0; i < key.length(); ++i) {
            hash += key.charAt(i);
            hash += (hash << 10);
            hash ^= (hash >> 6);
        }
        hash += (hash << 3);
        hash ^= (hash >> 11);
        hash += (hash << 15);
        return Math.abs(hash);
    }
}

package com.haowu.testBloomFilter;

import com.google.common.hash.BloomFilter;
import com.google.common.hash.Funnels;
import junit.framework.Assert;

/**
 * Bloom filter demo: quickly testing whether a value exists among a very large data set.
 * <ol>
 *   <li>If the filter reports a value as absent, it is definitely absent.</li>
 *   <li>If the filter reports a value as present, it is only probably present.</li>
 *   <li>Values cannot be removed from the filter.</li>
 * </ol>
 */
public class TestBloomFilter {

    /**
     * Runs the Guava-based demo.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        guavaTest();
    }


    /**
     * Exercises the hand-written {@link BloomFilters}: inserts the decimal strings of
     * 0..9,999,999, checks a few keys and prints the elapsed time in milliseconds.
     *
     * <p>Author's note: this works and copes with the data volume, but the GC log showed very
     * frequent collections and the old generation reached about 90% usage, i.e. memory
     * utilisation is poor.
     */
    public static void bloomFilterTest() {
        long start = System.currentTimeMillis();
        BloomFilters bloomFilters = new BloomFilters(10000000);
        for (int i = 0; i < 10000000; i++) {
            bloomFilters.add(i + "");
        }
        // Only inserted keys are guaranteed to be reported as present, so every assertTrue
        // below uses a key from the inserted range.
        Assert.assertTrue(bloomFilters.check(1 + ""));
        Assert.assertTrue(bloomFilters.check(2 + ""));
        Assert.assertTrue(bloomFilters.check(3 + ""));
        Assert.assertTrue(bloomFilters.check(999999 + ""));
        // NOTE(review): this key was never inserted, but a Bloom filter may still report it as
        // present (false positive), so this assertion is not guaranteed to hold.
        Assert.assertFalse(bloomFilters.check(400230340 + ""));
        long end = System.currentTimeMillis();
        System.out.println("执行时间：" + (end - start));
    }


    /**
     * Exercises Guava's {@link BloomFilter}: inserts the integers 0..9,999,999 into a filter
     * created for 10,000,000 expected insertions with a 0.01 false-positive probability, prints
     * the lookup result for -1 (never inserted) and the elapsed time in milliseconds.
     *
     * <p>Author's note: the GC log showed no full GC and low old-generation usage, noticeably
     * better than the hand-written filter, leaving room for more data.
     */
    public static void guavaTest() {
        long start = System.currentTimeMillis();
        BloomFilter<Integer> filter = BloomFilter.create(
                Funnels.integerFunnel(),
                10000000,
                0.01);

        for (int i = 0; i < 10000000; i++) {
            filter.put(i);
        }

        System.out.println(filter.mightContain(-1));
        long end = System.currentTimeMillis();
        System.out.println("执行时间：" + (end - start));
    }


}

旧人可安

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
如何快速判断一个数据是否存在于亿级数据中

package com.haowu.testBloomFilter;public class BloomFilters { /** * 数组长度 */ private int arraySize; /** * 数组 */ private int[] array; public BloomFilters(int a...
复制链接

扫一扫

专栏目录