@羲凡——只为了更好的活着
Spark 布隆过滤器(BloomFilter)的应用
object BloomFilterDemo {

  // Simple row type for the demo DataFrame.
  case class General(name: String, age: Int)

  /**
   * Demonstrates Spark's BloomFilter sketch:
   *  1. build a filter over a DataFrame column,
   *  2. serialize it to disk and read it back,
   *  3. test membership of candidate strings.
   */
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("BloomFilterDemo")
      .master("local[*]")
      .getOrCreate()
    val output = "D:\\aarontest\\data\\777.txt"
    import spark.implicits._

    val df = spark.sparkContext.parallelize(Seq(
      General("张辽", 227), General("司马懿", 188), General("扎克", 100)
    )).toDF
    df.show(false)
    val rdd = spark.sparkContext.parallelize(Seq("赵信", "司马懿", "扎克"))

    // Build the Bloom filter over the "name" column:
    // expectedNumItems = 20, false-positive probability = 1%.
    val bf = df.stat.bloomFilter("name", 20L, 0.01)

    // Serialize the filter to disk; try/finally guarantees the stream is
    // closed even if writeTo throws (the original leaked it on failure).
    val bos = new BufferedOutputStream(new FileOutputStream(output))
    try bf.writeTo(bos) finally bos.close()

    // Deserialize the filter back; same close-on-failure guarantee.
    val bis = new BufferedInputStream(new FileInputStream(output))
    val filter = try BloomFilter.readFrom(bis) finally bis.close()

    // Membership test: false => definitely not present;
    // true => probably present (may be a false positive).
    val resRdd = rdd.map(x => (x, filter.mightContainString(x)))
    resRdd.foreach(println)

    spark.stop()
  }
}
参考资料:https://spark.apache.org/docs/2.3.2/api/java/org/apache/spark/util/sketch/BloomFilter.html
https://www.cnblogs.com/itboys/p/11109478.html
====================================================================
@羲凡——只为了更好的活着
若对博客中有任何问题,欢迎留言交流