/**
 * Demo: builds a Bloom filter over the `id_` column of one DataFrame and uses it,
 * through a UDF, to filter the rows of a second DataFrame.
 *
 * NOTE: the `id_` column must not be numeric, otherwise the filter has no effect.
 * NOTE(review): this is presumably because the UDF probes the filter with a String,
 * while a filter built from an integral column is populated with long values, so the
 * lookups never match — confirm against the Spark version in use.
 */
object SparkAutoBloomFilterDemo {

  /**
   * Entry point: builds the demo data, prints it, builds the Bloom filter and prints
   * the rows of `df2` whose `id_` is reported as absent from the filter.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().enableHiveSupport().config(sparkConf).getOrCreate()
    import spark.implicits._

    try {
      // Alternative hand-written fixture (string ids, see the note on the object):
      // val df = Seq(
      //   ("dsad11", "First Value3"),
      //   ("dsad12", "Second Value4")
      // ).toDF("id_", "name")
      // val df2 = Seq(
      //   ("dsad11", "First Value31"),
      //   ("dsad12", "Second Value41"),
      //   ("dsad121", "Second Value4"),
      //   ("dsa3d11", "Second Value4")
      // ).toDF("id_", "name")
      val df: DataFrame = getGenerDataframe(3, spark)
      val df2: DataFrame = df

      println("=====df")
      df.show()
      println("=====df2")
      df2.show()

      // bloomFilter rejects a non-positive expected item count
      // ("Expected insertions must be positive"), so clamp to at least 1 for an
      // empty DataFrame. The count is kept as a Long: narrowing it with toInt
      // would silently overflow for very large inputs.
      val expectedItems: Long = math.max(df.count(), 1L)
      val bf = df.stat.bloomFilter("id_", expectedItems, 0.01)

      println("=====dfmight")
      // Keep only the rows whose id_ is reported as NOT present in the filter.
      df2.where(!might_contain(bf)($"id_")).show()
    } finally {
      // Release the session's resources even if one of the steps above fails.
      spark.stop()
    }
  }

  /**
   * Wraps a Bloom filter in a Spark UDF that tests string values for membership.
   *
   * @param f the Bloom filter to probe
   * @return a UDF that yields `false` for a null input and otherwise the result of
   *         `f.mightContain` for the given string
   */
  def might_contain(
      f: org.apache.spark.util.sketch.BloomFilter
  ): org.apache.spark.sql.expressions.UserDefinedFunction =
    udf((x: String) => x != null && f.mightContain(x))
}
// Remember: the column used for Bloom-filter de-duplication must not be numeric.