Spark SQL supports NOT IN — approach 1: plain SQL with an explicit id list
// Approach 1: filter rows whose `id` is NOT in a fixed list, using a SQL NOT IN clause.
val conf = new SparkConf().setAppName("spark_sql").setMaster("local[2]")
val sc = SparkContext.getOrCreate(conf)
// Input parquet paths to scan (placeholders here — fill in real paths).
val paths = Array("", "", "")
val sqlContext = new SQLContext(sc)
import sqlContext.implicits._
// Half-open time window [start, end) applied below via geq/lt.
val start: Long = 0L
val end: Long = 1L
// UDF extracting the "words" field from a JSON string column (fastjson).
sqlContext.udf.register("transfer", (str: String) => JSON.parseObject(str).getString("words"))
val selects = Array("publishTime", "time", "id", "age", "name", "keywords")
val dataFrame = sqlContext.read.parquet(paths: _*)
  .filter($"time".geq(start) and $"time".lt(end))
  .selectExpr(selects: _*)
dataFrame.registerTempTable("data")
val ids = Array("59a22a8724eedd9c84030985","59a22a8724eedd9c84030985","59a22a8724eedd9c84030985",
"59a22a8724eedd9c84030985","59a22a8724eedd9c84030985","59a22a8724eedd9c84030985","59a22a8724eedd9c84030985",
"59a22a8724eedd9c84030985","59a22a8724eedd9c84030985","59a22a8724eedd9c84030985")
// Quote each id so the IN-list is made of valid SQL string literals.
val filterIds = ids.map(id => s"'$id'").mkString(",")
// Bug fix: the original SQL read `not int($filterIds)` — a typo; the operator is NOT IN.
sqlContext.sql(
  s"""
     |select * from data where id not in ($filterIds)
   """.stripMargin)
Approach 2: emulate NOT IN with a left outer join plus a null filter (anti-join):
// Build a one-column-keyed DataFrame of the ids to exclude, plus a marker column.
// Bug fix: the marker column was named "age", which collides with the "age" column
// already selected into `dataFrame` (see `selects`); after the join $"age".isNull
// would be ambiguous (or resolve to the wrong column). Use a non-colliding name.
val idDF = sc.makeRDD(ids).map(id => (id, 1)).toDF("id", "exclude_flag")
// Rows with no match in idDF keep a null marker -> exactly the NOT IN semantics.
dataFrame.join(idDF, Seq("id"), "left_outer").where($"exclude_flag".isNull)