import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._
object DFTest {
  // Silence Spark's verbose INFO/WARN logging so only errors reach the console.
  // Runs at object initialization, i.e. before main is entered.
  Logger.getLogger("org").setLevel(Level.ERROR)
  Logger.getRootLogger().setLevel(Level.ERROR)

  /** Entry point: builds a small class-roster DataFrame and prints it.
    *
    * @param args command-line arguments (unused)
    */
  def main(args: Array[String]): Unit = {
    // Local-mode session using all available cores; getOrCreate reuses an
    // existing session if one is already active in this JVM.
    val spark = SparkSession.builder()
      .appName("DFTest")
      .master("local[*]")
      .getOrCreate()
    import spark.implicits._ // enables Seq(...).toDF(...)

    try {
      println("classDF")
      // Columns: class id, student number within class, name, score.
      val classDF = Seq(
        (1, 1, "Alice", 90),
        (1, 2, "Bob", 80),
        (2, 1, "Charlie", 85),
        (2, 2, "David", 95)
      ).toDF("class", "no", "name", "score")
      classDF.show()
    } finally {
      // Fix: the original never stopped the session, leaking the local
      // Spark context and its threads. Stop it even if show() throws.
      spark.stop()
    }
  }
}
classDF
+-----+---+-------+-----+
|class| no| name|score|
+-----+---+-------+-----+
| 1| 1| Alice| 90|
| 1| 2| Bob| 80|
| 2| 1|Charlie| 85|
| 2| 2| David| 95|
+-----+---+-------+-----+
println("studentDF")
// Roster of matched students: one (class, student-no) pair per class.
// Same rows as the original Seq literal, built in a single expression.
val studentDF = List((1, 1), (2, 1)).toDF("class", "no")
studentDF.show()
studentDF
+-----+---+
|class| no|
+-----+---+
| 1| 1|
| 2| 1|
+-----+---+
println("joinedDF")
// Rename the right side's key columns first so both sides' keys remain
// distinguishable in the joined output (class/no vs s_class/s_no).
val renamedStudents = studentDF
  .withColumnRenamed("class", "s_class")
  .withColumnRenamed("no", "s_no")
// Left outer join keeps every classDF row; unmatched rows get nulls
// in s_class/s_no.
val joinCondition = $"class" === $"s_class" && $"no" === $"s_no"
val joinedDF = classDF.join(renamedStudents, joinCondition, "left_outer")
joinedDF.show()
joinedDF
+-----+---+-------+-----+-------+----+
|class| no| name|score|s_class|s_no|
+-----+---+-------+-----+-------+----+
| 1| 1| Alice| 90| 1| 1|
| 1| 2| Bob| 80| null|null|
| 2| 1|Charlie| 85| 2| 1|
| 2| 2| David| 95| null|null|
+-----+---+-------+-----+-------+----+
// flag = 1 when the row matched a student record in the join (both renamed
// key columns non-null), 0 otherwise; then drop the helper join columns.
val matched = col("s_class").isNotNull && col("s_no").isNotNull
val resultDF = joinedDF
  .withColumn("flag", when(matched, 1).otherwise(0))
  .drop("s_class", "s_no")
println("resultDF")
resultDF.show()
resultDF
+-----+---+-------+-----+----+
|class| no| name|score|flag|
+-----+---+-------+-----+----+
| 1| 1| Alice| 90| 1|
| 1| 2| Bob| 80| 0|
| 2| 1|Charlie| 85| 1|
| 2| 2| David| 95| 0|
+-----+---+-------+-----+----+