背景
项目里需要对一个DataFrame,根据已有字段(country_id)新建另一个字段(new_country_id),因此采用withColumn + udf的方式。但是country_id字段存在null值,而udf的参数是Scala原始类型Long:遇到null输入时,Spark不会调用udf,而是直接返回null,看起来就像udf“失效”了。
代码
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.LongType
object Main {
def main(args: Array[String]): Unit = {
// Create (or reuse) a SparkSession named "planner" that runs locally ("local[*]" master),
// with the driver host set to 127.0.0.1.
val spark = new SparkSession.Builder().appName("planner")
.master("local[*]")
.config("spark.driver.host", "127.0.0.1")
.getOrCreate()
// UDF that takes country_id as a Scala primitive Long and always returns the constant 3L.
// NOTE(review): per Spark's `udf` documentation, a primitive-typed parameter cannot be
// null-checked and Spark returns null for a null input instead of running the closure;
// a boxed java.lang.Long or Option[Long] parameter would be needed to handle null inside
// the UDF — confirm against the Spark version in use.
val myUDF = udf((countryID: Long) => {
3L
})
import spark.implicits._
// Build a two-row DataFrame with a single Long column "id" holding 1 and 2.
val myDF = spark.sparkContext.parallelize(
Seq(1L, 2L)
).toDF("id")
// country_id is a Long-typed null where id equals 1, and the literal 1 for every other row.
.withColumn("country_id", when($"id" === 1 , lit(null).cast(LongType)).otherwise(lit(1)))
// new_country_id is the result of applying myUDF to country_id (including the null row).
.withColumn("new_country_id", myUDF($"country_id"))
// Print the DataFrame; the `false` argument disables truncation of long cell values.
myDF.show(false)