import scala.collection.mutable.ArrayBuffer
import scala.util.hashing.MurmurHash3
import scala.math.abs
/**
 * A simple Bloom filter over strings.
 *
 * The bitmap is kept in an `Array[Char]` where only the low 8 bits of every char carry
 * data, so the whole bitmap can be exported as / restored from a plain `String`.
 *
 * Basic parameters:
 *  - m: length of the bitmap in bits
 *  - n: number of elements the filter is expected to store
 *  - k: number of hash functions
 *  - f: false positive rate
 */
class BloomFilter {
  // Default hash seeds: the first k primes are taken from this table.
  private val PRIME_TABLE = Array(2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53,
    59, 61, 67, 71, 73, 79, 83, 89, 97, 101, 103, 107, 109, 113, 127, 131, 137, 139, 149, 151,
    157, 163, 167, 173, 179, 181, 191, 193, 197, 199, 211, 223, 227, 229, 233, 239, 241, 251, 257, 263,
    269, 271, 277, 281, 283, 293, 307, 311, 313, 317, 331)
  // Number of bitmap bits stored in each char of bitmapCharArray.
  private val BYTE_SIZE: Int = 8
  private var k: Int = _
  private var m: Int = _
  private var n: Int = _
  private var bitmapCharArray: Array[Char] = _
  private var seedArray: Array[Int] = _

  /**
   * Number of elements actually stored in the filter; every call to `put` increments it.
   * Only elements that were first checked against the filter (and found absent) should be
   * added with `put`: adding an element that is already present makes `nAct` inaccurate.
   * If the use case does not need `nAct`, this can be ignored.
   */
  private var nAct: Int = _

  /**
   * Restores a filter from a serialized bitmap, using the default (prime) seeds.
   *
   * @param bitmapStr bitmap as produced by `getBitmapStr()`; m becomes `bitmapStr.length * 8`
   * @param k         number of hash functions
   * @param n         expected number of elements
   * @param nAct      number of elements already stored in the bitmap
   */
  def this(bitmapStr: String, k: Int, n: Int, nAct: Int) = {
    this()
    loadBitmap(bitmapStr)
    this.k = k
    initSeedArray(k)
    this.n = n
    this.nAct = nAct
  }

  /**
   * Restores a filter from a serialized bitmap with caller-supplied hash seeds.
   *
   * @param bitmapStr bitmap as produced by `getBitmapStr()`; m becomes `bitmapStr.length * 8`
   * @param k         number of hash functions
   * @param seedArray seeds to use; if it is null or its length differs from k, the default
   *                  prime seeds are used instead
   * @param n         expected number of elements
   * @param nAct      number of elements already stored in the bitmap
   */
  def this(bitmapStr: String, k: Int, seedArray: Array[Int], n: Int, nAct: Int) = {
    this()
    loadBitmap(bitmapStr)
    this.k = k
    if (seedArray != null && seedArray.length == k) {
      this.seedArray = seedArray
    } else {
      initSeedArray(k)
    }
    this.n = n
    this.nAct = nAct
  }

  /**
   * Creates an empty filter.
   *
   * @param k number of hash functions
   * @param m requested bitmap length in bits; rounded up to a multiple of 8
   * @param n expected number of elements
   */
  def this(k: Int, m: Int, n: Int) = {
    this()
    this.k = k
    initSeedArray(k)
    bitmapCharArray = generateEmptyBitmap(m)
    this.m = bitmapCharArray.length * BYTE_SIZE
    this.n = n
    // A fresh filter holds no elements yet.
    this.nAct = 0
  }

  /** Installs the serialized bitmap and derives m (8 bits per char) from its length. */
  private def loadBitmap(bitmapStr: String): Unit = {
    bitmapCharArray = bitmapStr.toCharArray
    m = bitmapCharArray.length * BYTE_SIZE
  }

  /**
   * Initializes `seedArray` with k seeds. Seeds are taken from the prime table first; when
   * the table does not hold enough primes, the following primes are computed and appended.
   *
   * @param k number of seeds required
   */
  private def initSeedArray(k: Int): Unit = {
    seedArray =
      if (k <= PRIME_TABLE.length) {
        PRIME_TABLE.take(k)
      } else {
        val missing = k - PRIME_TABLE.length
        val extraPrimes: Array[Int] =
          Iterator.from(PRIME_TABLE.last + 1).filter(isPrime).take(missing).toArray
        PRIME_TABLE ++ extraPrimes
      }
  }

  /**
   * Creates a bitmap char array with every bit cleared.
   *
   * @param m requested number of bits
   * @return an array of ceil(m / 8) zero chars (empty when m is not positive)
   */
  private def generateEmptyBitmap(m: Int): Array[Char] = {
    val charNum = (m.toDouble / BYTE_SIZE).ceil.toInt
    new Array[Char](math.max(charNum, 0))
  }

  /**
   * Tests whether a string may be contained in the filter.
   *
   * @param str string to look up
   * @return false if the string was definitely never added; true if it may have been added
   */
  def exists(str: String): Boolean =
    seedArray.forall(seed => getBit(hash(str, seed)))

  /**
   * Adds a string to the filter. Every call increments nAct by one.
   *
   * @param str string to add
   */
  def put(str: String): Unit = {
    seedArray.foreach(seed => setBit(hash(str, seed)))
    addNAct()
  }

  /**
   * Sets bit number pos of the bitmap to 1.
   *
   * @param pos zero-based bit position
   */
  private def setBit(pos: Int): Unit = {
    val charPos = getCharPos(pos)
    val mask = 0x01 << (pos - charPos * BYTE_SIZE)
    // Keep only the low byte so the stored char stays within 0..255; going through a
    // signed Byte here would sign-extend and pollute the upper 8 bits of the char.
    bitmapCharArray(charPos) = ((bitmapCharArray(charPos) & 0xFF) | mask).toChar
  }

  /**
   * Reads bit number pos of the bitmap.
   *
   * @param pos zero-based bit position
   * @return true if the bit is set
   */
  private def getBit(pos: Int): Boolean = {
    val charPos = getCharPos(pos)
    val mask = 0x01 << (pos - charPos * BYTE_SIZE)
    (bitmapCharArray(charPos) & mask) != 0
  }

  /**
   * Returns the index (starting from 0) of the char that holds bit number pos.
   *
   * @param pos zero-based, non-negative bit position
   * @return index into bitmapCharArray
   */
  private def getCharPos(pos: Int): Int = pos / BYTE_SIZE

  /**
   * Computes the bit position of a string using MurmurHash3 with the given seed.
   *
   * @param str  string to hash
   * @param seed hash seed
   * @return a position in the range [0, m)
   */
  private def hash(str: String, seed: Int): Int = {
    // abs on Int is still negative for Int.MinValue; widening to Long first keeps the
    // position non-negative while leaving every other hash value mapped as before.
    (abs(MurmurHash3.stringHash(str, seed).toLong) % m).toInt
  }

  /**
   * Tests whether a number is prime by trial division up to its square root.
   *
   * @param n candidate number
   * @return true if n is prime
   */
  private def isPrime(n: Int): Boolean =
    n > 1 && Iterator.from(2).takeWhile(i => i.toLong * i <= n).forall(n % _ != 0)

  /**
   * Returns the bitmap serialized as a string (one char per 8 bits).
   *
   * @return string form of the bitmap, accepted by the restoring constructors
   */
  def getBitmapStr(): String = {
    bitmapCharArray.mkString
  }

  /** Number of hash functions. */
  def getK(): Int = k

  /** Bitmap length in bits. */
  def getM(): Int = m

  /** Expected number of elements. */
  def getN(): Int = n

  /** Number of `put` calls recorded so far (plus the initial value given on restore). */
  def getNAct(): Int = nAct

  private def addNAct(): Unit = {
    nAct = nAct + 1
  }
}
object BloomFilter {
  /**
   * Defaults: m = 96000 and k = 9, which give a false positive rate of about 3.1e-3
   * for n = 8000 elements.
   */
  val DEFAULT_M = 96000
  val DEFAULT_K = 9

  /**
   * Computes the optimal number of hash functions k and bitmap length m.
   *
   * Uses the standard Bloom filter formulas m = -n * ln(f) / (ln 2)^2 and
   * k = (m / n) * ln 2, both rounded up.
   *
   * @param n expected number of elements; must be positive
   * @param f target false positive rate; must lie strictly between 0 and 1
   * @return the pair (k, m); (DEFAULT_K, DEFAULT_M) when n or f is out of range
   */
  def calculateOptimalKM(n: Int, f: Double): (Int, Int) = {
    // f = 0 would give an infinite m and f >= 1 a non-positive m, so both fall back to
    // the defaults (a NaN f fails these comparisons as well).
    if (n > 0 && f > 0.0 && f < 1.0) {
      val ln2 = math.log(2.0)
      val m = (-n * math.log(f) / (ln2 * ln2)).ceil.toInt
      val k = (ln2 * m / n).ceil.toInt
      (k, m)
    } else {
      (DEFAULT_K, DEFAULT_M)
    }
  }
}
/** Small demo: adds three strings to a Bloom filter and prints two membership checks. */
object Test {
  /**
   * Builds a filter with k = 3, m = 20 and n = 5, inserts "abc", "efg" and "hij", then
   * prints the result of looking up "abc" (which was added) and "abe" (which was not).
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val bloomFilter = new BloomFilter(3, 20, 5)
    bloomFilter.put("abc")
    bloomFilter.put("efg")
    bloomFilter.put("hij")
    val bl = bloomFilter.exists("abc")
    println(bl)
    val bl2 = bloomFilter.exists("abe")
    println(bl2)
  }
}
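// Sample console output of Test.main:
//   true
//   false
//   Process finished with exit code 0
//
// Follow the WeChat public account "Feige Big Data" (飞哥大数据) and reply 666 to get 100+ real
// company interview questions from 2022, plus a collection of Spark and Flink interview questions.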