select * from table where column != "value";
这样一句SQL妇孺皆知,而且在MySQL/Oracle中column为null的行也会被过滤掉;那么在Spark中表现又会是怎样呢?(Spark SQL也在悄悄地往SQL的语法标准上靠拢)
import org.apache.log4j.{Level, Logger}
import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession
object Test {
  // Silence Spark's noisy internal logging before the session spins up.
  Logger.getLogger("org").setLevel(Level.WARN)

  /** Demonstrates that `Column.notEqual` (SQL `!=`) silently drops rows
    * whose value is null — matching MySQL/Oracle three-valued logic — and
    * shows how an explicit `isNull` predicate keeps those rows.
    */
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setMaster("local[1]").setAppName("test")
    val spark = SparkSession.builder().config(sparkConf).getOrCreate()
    spark.sparkContext.setLogLevel("WARN")

    import spark.implicits._

    // Sample data: row 3 deliberately carries a null in "source".
    val rows = Seq(
      (1, "A"),
      (2, "B"),
      (3, null)
    )
    val df = spark.createDataFrame(rows).toDF("id", "source")

    df.show()
    //+---+------+
    //| id|source|
    //+---+------+
    //|  1|     A|
    //|  2|     B|
    //|  3|  null|
    //+---+------+

    // notEqual evaluates to NULL (not true) for null values, so row 3 is dropped.
    df.filter($"source".notEqual("A")).show()
    //+---+------+
    //| id|source|
    //+---+------+
    //|  2|     B|
    //+---+------+

    // Adding an explicit isNull check retains the null row.
    df.filter($"source".notEqual("A").or($"source".isNull)).show()
    //+---+------+
    //| id|source|
    //+---+------+
    //|  2|     B|
    //|  3|  null|
    //+---+------+

    spark.stop()
  }
}
由此可见,Spark中的表现跟MySQL一模一样(notEqual同样会过滤掉值为null的行,即使null是有业务意义的数据),所以在处理数据过程中一定要注意:假如某列的值可能出现null,并且这些数据是有效数据,那么在过滤数据时要小心设置条件(例如显式加上isNull判断),以防止误过滤掉值为null却有效的数据!