Scala 实现布隆过滤器代码

import scala.collection.mutable.ArrayBuffer
import scala.util.hashing.MurmurHash3
import scala.math.abs
/**
  * A Bloom filter for strings whose bitmap is kept in an `Array[Char]`;
  * only the low 8 bits of every char carry filter bits.
  *
  * Basic parameters:
  *  - m - length of the bitmap in bits
  *  - n - number of elements that are meant to be stored
  *  - k - number of hash functions
  *  - f - false positive rate
  */
class BloomFilter {
    // Leading primes; the first k of them are used as MurmurHash3 seeds.
    private val PRIME_TABLE = Array(2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53,
        59, 61, 67, 71, 73, 79, 83, 89, 97, 101, 103, 107, 109, 113, 127, 131, 137, 139, 149, 151,
        157, 163, 167, 173, 179, 181, 191, 193, 197, 199, 211, 223, 227, 229, 233, 239, 241, 251, 257, 263,
        269, 271, 277, 281, 283, 293, 307, 311, 313, 317, 331)
    // Number of filter bits stored per char of the bitmap.
    private val BYTE_SIZE: Int = 8
    private var k: Int = _
    private var m: Int = _
    private var n: Int = _
    private var bitmapCharArray: Array[Char] = _
    private var seedArray: Array[Int] = _
    /**
      * Number of elements actually stored in the filter; every call to `put` increments it.
      * Only elements that were first checked against the filter should be added with `put`:
      * adding an element that is already present makes nAct inaccurate. If nAct is not
      * needed in the use case, this can be ignored.
      */
    private var nAct: Int = _

    /**
      * Restores a filter from a bitmap string, using the default (prime) seeds.
      *
      * @param bitmapStr bitmap as returned by `getBitmapStr()`; m becomes 8 * its length
      * @param k         number of hash functions
      * @param n         number of elements that are meant to be stored
      * @param nAct      number of elements already stored
      */
    def this(bitmapStr: String, k: Int, n: Int, nAct: Int) = {
        this()
        bitmapCharArray = bitmapStr.toCharArray
        m = bitmapCharArray.length * BYTE_SIZE
        this.k = k
        initSeedArray(k)
        this.n = n
        this.nAct = nAct
    }

    /**
      * Restores a filter from a bitmap string with caller-supplied seeds.
      * The default seeds are used instead when `seedArray` is null or does not
      * contain exactly k entries.
      *
      * @param bitmapStr bitmap as returned by `getBitmapStr()`; m becomes 8 * its length
      * @param k         number of hash functions
      * @param seedArray one hash seed per hash function
      * @param n         number of elements that are meant to be stored
      * @param nAct      number of elements already stored
      */
    def this(bitmapStr: String, k: Int, seedArray: Array[Int], n: Int, nAct: Int) = {
        this()
        bitmapCharArray = bitmapStr.toCharArray
        m = bitmapCharArray.length * BYTE_SIZE
        this.k = k
        Option(seedArray).filter(_.length == k) match {
            // Copy so that later changes to the caller's array cannot alter the seeds.
            case Some(seeds) => this.seedArray = seeds.clone()
            case None        => initSeedArray(k)
        }
        this.n = n
        this.nAct = nAct
    }

    /**
      * Creates an empty filter.
      *
      * @param k number of hash functions
      * @param m requested bitmap length in bits; rounded up to a multiple of 8
      * @param n number of elements that are meant to be stored
      */
    def this(k: Int, m: Int, n: Int) = {
        this()
        this.k = k
        initSeedArray(k)
        bitmapCharArray = generateEmptyBitmap(m)
        this.m = bitmapCharArray.length * BYTE_SIZE
        this.n = n
        // A freshly created filter holds no elements.
        this.nAct = 0
    }

    /**
      * Fills `seedArray` with k seeds. Seeds are taken from the prime table first;
      * when the table has fewer than k entries, the following primes are appended.
      *
      * @param k number of seeds to generate
      */
    private def initSeedArray(k: Int): Unit = {
        seedArray =
            if (k <= PRIME_TABLE.length) {
                PRIME_TABLE.take(k)
            } else {
                val missing = k - PRIME_TABLE.length
                val extraPrimes = Iterator.from(PRIME_TABLE.last + 1).filter(isPrime).take(missing)
                PRIME_TABLE ++ extraPrimes.toArray
            }
    }

    /**
      * Creates a bitmap char array in which every bit is cleared.
      *
      * @param m requested number of bits
      * @return array of ceil(m / 8) zero chars (empty when m is not positive)
      */
    private def generateEmptyBitmap(m: Int): Array[Char] = {
        val charNum = (m.toDouble / BYTE_SIZE).ceil.toInt
        // New char arrays are zero-initialised, i.e. all bits are cleared.
        new Array[Char](math.max(charNum, 0))
    }

    /**
      * Tests whether a string may be contained in the filter.
      *
      * @param str string to look up
      * @return false if at least one of the string's bits is cleared, true otherwise
      */
    def exists(str: String): Boolean = {
        // forall stops at the first cleared bit.
        seedArray.forall(seed => getBit(hash(str, seed)))
    }

    /**
      * Adds a string to the filter.
      * Every call increments nAct by 1.
      *
      * @param str string to add
      */
    def put(str: String): Unit = {
        seedArray.foreach(seed => setBit(hash(str, seed)))
        addNAct()
    }

    /**
      * Sets bit number pos of the bitmap to 1.
      *
      * @param pos bit index
      */
    private def setBit(pos: Int): Unit = {
        val charPos = getCharPos(pos)
        val mask = 0x01 << (pos - charPos * BYTE_SIZE)
        // Keep only the low 8 bits so that bit 7 is not sign-extended into the
        // upper half of the stored char.
        bitmapCharArray(charPos) = ((bitmapCharArray(charPos) & 0xFF) | mask).toChar
    }

    /**
      * Reads bit number pos of the bitmap.
      *
      * @param pos bit index
      * @return true if the bit is set
      */
    private def getBit(pos: Int): Boolean = {
        val charPos = getCharPos(pos)
        val mask = 0x01 << (pos - charPos * BYTE_SIZE)
        (bitmapCharArray(charPos) & mask) != 0
    }

    /**
      * Index (counted from 0) of the char that holds bit number pos.
      *
      * @param pos bit index
      * @return index into the bitmap char array
      */
    private def getCharPos(pos: Int): Int = {
        pos / BYTE_SIZE
    }

    /**
      * Maps a string to a bit index using MurmurHash3.
      *
      * @param str  string to hash
      * @param seed hash seed
      * @return absolute value of the hash, modulo m
      */
    private def hash(str: String, seed: Int): Int = {
        // abs on Int leaves Int.MinValue negative, which would give a negative index;
        // widening to Long first keeps the result non-negative.
        (abs(MurmurHash3.stringHash(str, seed).toLong) % m).toInt
    }

    /**
      * Tests whether a number is prime.
      *
      * @param n number to test
      * @return true if n is prime
      */
    private def isPrime(n: Int): Boolean = {
        // Divisors are tried while i <= n / i, which is i * i <= n without the overflow.
        n > 1 && Iterator.from(2).takeWhile(i => i <= n / i).forall(i => n % i != 0)
    }

    /**
      * Returns the bitmap as a string, one char per group of 8 bits.
      *
      * @return string built from the bitmap char array
      */
    def getBitmapStr(): String = {
        bitmapCharArray.mkString
    }

    /** @return number of hash functions */
    def getK():Int = k
    /** @return bitmap length in bits */
    def getM():Int = m
    /** @return number of elements that are meant to be stored */
    def getN():Int = n
    /** @return number of `put` calls counted so far (plus the initial nAct, if any) */
    def getNAct():Int = nAct

    // Counts one more stored element.
    private def addNAct(): Unit = {
        nAct = nAct + 1
    }
}
object BloomFilter{
    /**
      * Defaults: with m = 96000 and k = 9, a filter holding n = 8000 elements
      * reaches a false positive rate of about 3.1e-3.
      */
    val DEFAULT_M = 96000
    val DEFAULT_K = 9

    /**
      * Computes the optimal k and m for a given capacity and false positive rate,
      * using m = ceil(-n * ln(f) / (ln 2)^2) and k = ceil(ln 2 * m / n).
      *
      * @param n number of elements that are meant to be stored
      * @param f target false positive rate
      * @return (k, m); (DEFAULT_K, DEFAULT_M) unless n > 0 and 0 < f < 1
      */
    def calculateOptimalKM(n: Int, f: Double): (Int, Int) = {
        // f == 0 would need an infinite bitmap and f >= 1 a non-positive one, so both
        // (as well as NaN, which fails every comparison) fall back to the defaults.
        if (n > 0 && f > 0.0 && f < 1.0) {
            val ln2 = math.log(2.0)
            val m = (-n * math.log(f) / (ln2 * ln2)).ceil.toInt
            val k = (ln2 * m / n).ceil.toInt
            (k, m)
        } else {
            (DEFAULT_K, DEFAULT_M)
        }
    }
}

object Test {

  /**
    * Small demo: creates a filter with k = 3, m = 20 and n = 5, adds three strings
    * and prints the lookup result for a stored string ("abc") and for a string
    * that was not added ("abe").
    *
    * @param args command line arguments (unused)
    */
  def main(args: Array[String]): Unit = {
      val bloomFilter = new BloomFilter(3, 20, 5)
      Seq("abc", "efg", "hij").foreach(bloomFilter.put)
      println(bloomFilter.exists("abc"))
      println(bloomFilter.exists("abe"))
  }
}


true
false

Process finished with exit code 0

关注微信公众号【飞哥大数据】,回复666 获取2022年100+公司面试真题,以及spark与flink面试题汇总

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值